# Introduction to pandas

In [2]:
import json
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from tqdm.notebook import tqdm

## Metodo dinamico

In [29]:
# Build a small frame from a nested list, labelling rows and columns explicitly.
dfl = pd.DataFrame(
    data=[[1.2, 2], [9, 10]],
    index=['riga_1', 'riga_2'],
    columns=['col_1', 'col_2'],
)

In [30]:
# Show the dtype pandas inferred for each column of dfl.
dfl.dtypes

col_1    float64
col_2      int64
dtype: object

In [43]:
# Build a frame from a list of records: each dict is one row and its keys
# become the column labels.
dfd = pd.DataFrame(
    data=[
        dict(nome='alfio', cognome='ferrara', eta=47),
        dict(nome='diego', cognome='ciccone', eta=41),
    ]
)

In [45]:
# Show the dtype pandas inferred for each column of dfd.
dfd.dtypes

nome       object
cognome    object
eta         int64
dtype: object

## Metodo statico

### Esempio Titanic
[https://www.kaggle.com/competitions/titanic/data?select=train.csv](https://www.kaggle.com/competitions/titanic/data?select=train.csv)

In [51]:
# Location of the Kaggle Titanic training CSV. Set the TITANIC_CSV environment
# variable to point at your own copy; the original local path is kept as the
# fallback so existing setups keep working unchanged.
titanic_file = os.environ.get(
    'TITANIC_CSV', '/Users/flint/Data/kaggle/titanic/train.csv'
)

In [52]:
def sex_conv(x):
    """Map a raw sex label to a one-letter code.

    Returns 'M' for 'male', 'F' for 'female', and NaN for any other value
    (the comparison is exact, so different casing or padding yields NaN).
    """
    if x == 'male':
        return 'M'
    if x == 'female':
        return 'F'
    return np.nan


# Column name -> converter function, in the shape expected by the
# `converters` argument of pd.read_csv.
converter = {'Sex': sex_conv}

In [53]:
# Load the CSV at titanic_file; the functions in `converter` are applied to
# their matching columns while the file is parsed.
dft = pd.read_csv(titanic_file, converters=converter)

In [57]:
# Preview the last two rows of the loaded frame.
dft.tail(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
889,890,1,1,"Behr, Mr. Karl Howell",M,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",M,32.0,0,0,370376,7.75,,Q


### Read from SQL

In [58]:
# The password is read from a local JSON file rather than written in the
# notebook.
with open('/Users/flint/Data/postgresql/conf.json', 'r') as inj:
    conf = json.load(inj)

db = 'nlp'
schema = 'yelp'
psw = conf['psw']
# Percent-encode the password before embedding it in the URL: characters such
# as '@', ':', '/' or '#' would otherwise be parsed as URL delimiters and
# produce a wrong or unparsable connection string.
engine = create_engine(
    "postgresql://flint:{}@localhost:5432/{}".format(quote_plus(str(psw)), db),
    # Passes `-csearch_path=<schema>` as a connection option.
    connect_args={'options': '-csearch_path={}'.format(schema)},
)

In [77]:
def neigh(x):
    """Return NaN when ``x`` is None; otherwise return ``x`` unchanged."""
    return np.nan if x is None else x

In [85]:
# Join every business row with its city row.
# NOTE(review): `C.*` also returns a `name` column, so the resulting frame
# carries two columns labelled 'name' (business and city) -- consider
# aliasing the city columns if either is needed by label.
sql = """
SELECT B.id AS bid, B.name, B.stars, B.lon, B.lat, C.*
FROM yelp.business AS B 
JOIN yelp.city AS C ON B.location = C.id
"""

# Run the query through the engine; the `bid` column becomes the index.
dfs = pd.read_sql(sql, con=engine, index_col='bid')

### Pseudo SQL

#### Select

In [95]:
# Row selection (SQL WHERE): state is 'AZ' OR stars above 3; preview two rows.
dfs.loc[dfs['state'].eq('AZ') | dfs['stars'].gt(3)].head(2)

Unnamed: 0_level_0,name,stars,lon,lat,id,name,state
bid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
FYWN1wneV18bWNgQjJ2GNg,Dental by Design,4.0,-111.978599,33.33069,D66C6A,Ahwatukee,AZ
KQPW8lFf1y5BT2MxiSZ3QA,Western Motor Vehicle,1.5,-112.11531,33.524903,185B21,Phoenix,AZ


#### Projection

In [98]:
# Same row filter as the selection example, keeping only the coordinate
# columns; rows and columns are picked in a single .loc call.
dfs.loc[dfs['state'].eq('AZ') | dfs['stars'].gt(3), ['lon', 'lat']].head(2)

Unnamed: 0_level_0,lon,lat
bid,Unnamed: 1_level_1,Unnamed: 2_level_1
FYWN1wneV18bWNgQjJ2GNg,-111.978599,33.33069
KQPW8lFf1y5BT2MxiSZ3QA,-112.11531,33.524903


#### Group by and sort

In [105]:
# Per-state standard deviation of the numeric columns, sorted by the spread of
# star ratings (largest first; NaN rows end up last).
# The numeric columns are selected explicitly: since pandas 2.0, GroupBy.std
# no longer drops non-numeric columns silently and would raise on the text
# columns of this frame (name, id).
(
    dfs.groupby('state')[['stars', 'lon', 'lat']]
    .std()
    .sort_values('stars', ascending=False)
)

Unnamed: 0_level_0,stars,lon,lat
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AR,2.474874,0.220792,0.072705
NTH,2.121320,0.496785,3.375282
NY,1.202190,9.837088,2.060925
6,1.154701,0.010478,0.005159
VT,1.060660,0.021282,0.014514
...,...,...,...
TAM,,,
VA,,,
WA,,,
WHT,,,
