In [None]:
import pandas as pd

In [None]:
# URL of the raw election CSV hosted on GitHub; read into `my_df` in the next cell.
filepath = 'https://raw.githubusercontent.com/andreagiussani/Applied_Machine_Learning_with_Python/master/data/election.csv'

In [None]:
my_df = pd.read_csv(filepath)

## Data Enrichment and Transformation

In [None]:
my_df.head()

Unnamed: 0,county,state,total,Obama,Romney,winner,voters,turnout,margin
0,Adams,PA,41973,35.482334,63.112001,Romney,61156,68.632677,27.629667
1,Allegheny,PA,614671,56.640219,42.18582,Obama,924351,66.497575,14.454399
2,Armstrong,PA,28322,30.696985,67.901278,Romney,42147,67.19814,37.204293
3,Beaver,PA,80015,46.032619,52.63763,Romney,115157,69.483401,6.605012
4,Bedford,PA,21444,22.057452,76.98657,Romney,32189,66.619031,54.929118


In [None]:
# Boolean flag per row: True where the `voters` value exceeds 70000.
cond_one = my_df['voters'].gt(70000)
my_df['new_col'] = cond_one

In [None]:
type(my_df)

pandas.core.frame.DataFrame

In [None]:
# Warm-up example: walk a list with enumerate and print index + lower-cased item.
my_list = list('ABC')
for idx, x in enumerate(my_list):
    print(f'{idx} {x.lower()}')

0 a
1 b
2 c


In [None]:
# Same lower-casing as the loop above, written as a list comprehension.
[letter.lower() for letter in my_list]

['a', 'b', 'c']

In [None]:
%%timeit
# Benchmark 1: build the lower-cased `winner` values with an explicit row loop over iterrows().
# NOTE(review): names assigned inside a %%timeit cell (here `my_new_list`) do not appear to be
# exported to the notebook namespace — confirm that later cells can actually see `my_new_list`.
my_new_list = []
for idx, row in my_df.iterrows():
  row_winner = row.winner.lower()
  my_new_list.append(row_winner)

3.4 ms ± 679 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# `my_new_list` is only assigned inside the preceding %%timeit cell, and %%timeit runs its
# body in a private scope, so the name never reaches the notebook namespace. Build the list
# here so this cell works after Restart Kernel -> Run All.
my_new_list = my_df['winner'].str.lower().tolist()
my_df['new_col_2'] = my_new_list

In [None]:
%%timeit
# Benchmark 2: lower-case `winner` element-wise through Series.apply with a Python lambda.
my_df['new_col_2'] = my_df['winner'].apply(lambda x: x.lower())

347 µs ± 52.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
%%timeit
# Benchmark 3: same row-wise iterrows() traversal as benchmark 1, written as a list comprehension.
my_df['new_col_2'] = [x.winner.lower() for idx, x in my_df.iterrows()]

3.66 ms ± 504 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
my_df.head()

Unnamed: 0,county,state,total,Obama,Romney,winner,voters,turnout,margin,new_col,new_col_2
0,Adams,PA,41973,35.482334,63.112001,Romney,61156,68.632677,27.629667,False,romney
1,Allegheny,PA,614671,56.640219,42.18582,Obama,924351,66.497575,14.454399,True,obama
2,Armstrong,PA,28322,30.696985,67.901278,Romney,42147,67.19814,37.204293,False,romney
3,Beaver,PA,80015,46.032619,52.63763,Romney,115157,69.483401,6.605012,True,romney
4,Bedford,PA,21444,22.057452,76.98657,Romney,32189,66.619031,54.929118,False,romney


### Mapping

In [None]:
# Lookup table: lower-cased winner name -> colour label.
my_dict = dict(romney='red', obama='blue')

In [None]:
# Translate each lower-cased winner into its colour via the `my_dict` lookup.
# With a dict argument, Series.map yields NaN for any value that is not a key of the dict.
my_df['new_col_3'] = my_df['new_col_2'].map(my_dict)

In [None]:
my_df.head()

Unnamed: 0,county,state,total,Obama,Romney,winner,voters,turnout,margin,new_col,new_col_2,new_col_3
0,Adams,PA,41973,35.482334,63.112001,Romney,61156,68.632677,27.629667,False,romney,red
1,Allegheny,PA,614671,56.640219,42.18582,Obama,924351,66.497575,14.454399,True,obama,blue
2,Armstrong,PA,28322,30.696985,67.901278,Romney,42147,67.19814,37.204293,False,romney,red
3,Beaver,PA,80015,46.032619,52.63763,Romney,115157,69.483401,6.605012,True,romney,red
4,Bedford,PA,21444,22.057452,76.98657,Romney,32189,66.619031,54.929118,False,romney,red


In [None]:
my_df.shape

(67, 12)

## Manipulation 

In [None]:
# Function form of pivot: one row per `state`, one column per `county`,
# each cell holding that county's `winner`.
my_df2 = pd.pivot(
    data=my_df,
    index='state',
    columns='county',
    values='winner'
)

In [None]:
# Method form of pivot with the same index/columns/values as the pd.pivot(...) call;
# `my_df2` is simply reassigned.
my_df2 = my_df.pivot(
    index='state',
    columns='county',
    values='winner'
)

In [None]:
my_df2

county,Adams,Allegheny,Armstrong,Beaver,Bedford,Berks,Blair,Bradford,Bucks,Butler,...,Susquehanna,Tioga,Union,Venango,Warren,Washington,Wayne,Westmoreland,Wyoming,York
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PA,Romney,Obama,Romney,Romney,Romney,Romney,Romney,Romney,Obama,Romney,...,Romney,Romney,Romney,Romney,Romney,Romney,Romney,Romney,Romney,Romney


In [None]:
# One row per county, one column per winner, cells hold `margin`;
# the cells left empty by the pivot are filled with 0.
new_df_pivot = (
    my_df
    .pivot(index='county', columns='winner', values='margin')
    .fillna(0)
)

In [None]:
# Pivot on a two-level row index (county, colour) with `state` as columns,
# keeping both the `Obama` and `Romney` value columns; empty cells become 0.
(
    my_df
    .pivot(index=['county', 'new_col_3'], columns='state', values=['Obama', 'Romney'])
    .fillna(0)
    .head()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Obama,Romney
Unnamed: 0_level_1,state,PA,PA
county,new_col_3,Unnamed: 2_level_2,Unnamed: 3_level_2
Adams,red,35.482334,63.112001
Allegheny,blue,56.640219,42.18582
Armstrong,red,30.696985,67.901278
Beaver,red,46.032619,52.63763
Bedford,red,22.057452,76.98657


In quante contee ha vinto Obama e in quante Romney?

In [None]:
my_df['winner'].value_counts()

Romney    55
Obama     12
Name: winner, dtype: int64

In [None]:
# Same tally as value_counts(), via pivot_table: count of `voters` entries
# per `winner` (rows), split by `state` (columns).
my_df.pivot_table(
    index=['winner'],
    columns='state',
    values='voters',
    aggfunc='count'
)

state,PA
winner,Unnamed: 1_level_1
Obama,12
Romney,55


Calcolo la media dei `voters` nelle contee vinte da ciascun candidato.

In [None]:
# Mean of `voters` per `winner` (rows), split by `state` (columns).
my_df.pivot_table(
    index=['winner'],
    columns='state',
    values='voters',
    aggfunc='mean'
)

state,PA
winner,Unnamed: 1_level_1
Obama,387644.083333
Romney,70114.290909


In [None]:
# Largest `voters` value per `winner` (rows), split by `state` (columns).
my_df.pivot_table(
    index=['winner'],
    columns='state',
    values='voters',
    aggfunc='max'
)

state,PA
winner,Unnamed: 1_level_1
Obama,1099197
Romney,337822


## Un nuovo dataset da usare

In [None]:
# Switch to the Titanic CSV. NOTE: this rebinds `filepath`, which until now held the
# election CSV URL, so the earlier URL is no longer reachable through that name.
filepath = 'https://raw.githubusercontent.com/andreagiussani/Applied_Machine_Learning_with_Python/master/data/titanic.csv'
titanic_df = pd.read_csv(filepath)

In [None]:
# NOTE(review): repeats the read done in the previous cell (same `filepath`, same target
# variable), so the CSV is downloaded a second time — looks redundant.
titanic_df = pd.read_csv(filepath)

In [None]:
titanic_df.Pclass.value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [None]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Mean and median of Fare and Age for every (Pclass, Embarked) combination.
titanic_df.pivot_table(
    values=['Fare', 'Age'],
    index='Pclass',
    columns='Embarked',
    aggfunc=['mean', 'median'],
)

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,median,median,median,median,median,median
Unnamed: 0_level_1,Age,Age,Age,Fare,Fare,Fare,Age,Age,Age,Fare,Fare,Fare
Embarked,C,Q,S,C,Q,S,C,Q,S,C,Q,S
Pclass,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
1,38.027027,38.5,38.152037,104.718529,90.0,70.364862,36.5,38.5,37.0,78.2667,90.0,52.0
2,22.766667,43.5,30.386731,25.358335,12.35,20.327439,25.0,43.5,30.0,24.0,12.35,13.5
3,20.741951,25.9375,25.696552,11.214083,11.183393,14.644083,20.0,21.5,25.0,7.8958,7.75,8.05


In [None]:
# Mean and median of Fare and Age per passenger class (no column split).
titanic_df.pivot_table(
    values=['Fare', 'Age'],
    index='Pclass',
    aggfunc=['mean', 'median'],
)

Unnamed: 0_level_0,mean,mean,median,median
Unnamed: 0_level_1,Age,Fare,Age,Fare
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,38.233441,84.154687,37.0,60.2875
2,29.87763,20.662183,29.0,14.25
3,25.14062,13.67555,24.0,8.05


In [None]:
# groupby equivalent of the pivot_table above: mean and median of Fare and Age per Pclass.
# The columns are selected with a list ([['Fare', 'Age']]): tuple-style selection
# (['Fare', 'Age']) on a GroupBy emits a FutureWarning on older pandas and is an error
# on pandas >= 2.0. Pre-slicing the frame is unnecessary once the columns are selected here.
titanic_df.groupby('Pclass')[['Fare', 'Age']].agg(['mean', 'median'])

  titanic_df[['Pclass', 'Fare', 'Age']].groupby('Pclass')['Fare', 'Age'].agg(['mean', 'median'])


Unnamed: 0_level_0,Fare,Fare,Age,Age
Unnamed: 0_level_1,mean,median,mean,median
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,84.154687,60.2875,38.233441,37.0
2,20.662183,14.25,29.87763,29.0
3,13.67555,8.05,25.14062,24.0


Calcolare la media di sopravvivenza (`Survived`) per genere (`Sex`) e classe (`Pclass`).

In [None]:
# Mean of `Survived` for every (Pclass, Sex) pair, using groupby + agg.
(
    titanic_df
    .groupby(['Pclass', 'Sex'])[['Survived']]
    .agg(['mean'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived
Unnamed: 0_level_1,Unnamed: 1_level_1,mean
Pclass,Sex,Unnamed: 2_level_2
1,female,0.968085
1,male,0.368852
2,female,0.921053
2,male,0.157407
3,female,0.5
3,male,0.135447


In [None]:
# Same survival means as the groupby cell, expressed as a pivot table.
titanic_df.pivot_table(
    values='Survived',
    index=['Pclass', 'Sex'],
    aggfunc='mean',
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived
Pclass,Sex,Unnamed: 2_level_1
1,female,0.968085
1,male,0.368852
2,female,0.921053
2,male,0.157407
3,female,0.5
3,male,0.135447


In [None]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Homework: find out which class the passengers whose age was not recorded belong to.

## Imputazione dei Missing Values con Scikit-learn

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Fill the missing `Age` values with SimpleImputer (default strategy: column mean).
# `PassengerId` has no missing values, so it is only cast to float by the transform.
si = SimpleImputer()
# fit_transform returns a bare ndarray: pass the column labels and the original index
# back in so the result is not labelled 0, 1 and stays aligned with `titanic_df`.
pd.DataFrame(
    si.fit_transform(titanic_df[['PassengerId', 'Age']]),
    columns=['PassengerId', 'Age'],
    index=titanic_df.index,
)

Unnamed: 0,0,1
0,1.0,22.000000
1,2.0,38.000000
2,3.0,26.000000
3,4.0,35.000000
4,5.0,35.000000
...,...,...
886,887.0,27.000000
887,888.0,19.000000
888,889.0,29.699118
889,890.0,26.000000
