In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy frame for the missing-value demos: three numeric features (f1-f3) and
# one string feature (f4). np.nan marks every missing entry, and row 2 is
# missing in all four columns.
df = pd.DataFrame(
    data = [
        [1.5,    2.5,    np.nan, "blue"],
        [10,     np.nan, 1000,   "red"],
        [np.nan, np.nan, np.nan, np.nan],
        [2,      0,      4,      "red"],
        [40,     40,     40,     "blue"],
        [1,      2,      3,      np.nan],
    ],
    columns = ["f1", "f2", "f3", "f4"],
)

In [3]:
# Display the six-row toy frame to see where the missing entries sit.
df

Unnamed: 0,f1,f2,f3,f4
0,1.5,2.5,,blue
1,10.0,,1000.0,red
2,,,,
3,2.0,0.0,4.0,red
4,40.0,40.0,40.0,blue
5,1.0,2.0,3.0,


In [4]:
# Number of missing values in each column (True counts as 1 when summed).
df.isna().sum()

f1    1
f2    2
f3    2
f4    2
dtype: int64

In [5]:
# Element-wise boolean mask: True wherever the frame holds a missing value.
df.isna()

Unnamed: 0,f1,f2,f3,f4
0,False,False,True,False
1,False,True,False,False
2,True,True,True,True
3,False,False,False,False
4,False,False,False,False
5,False,False,False,True


In [6]:
# Fraction of missing values per column: the mean of the boolean mask.
df.isna().mean()

f1    0.166667
f2    0.333333
f3    0.333333
f4    0.333333
dtype: float64

In [7]:
# Missing-value mask for a single column, via the Series method.
df["f1"].isna()

0    False
1    False
2     True
3    False
4    False
5    False
Name: f1, dtype: bool

In [8]:
# Replace every missing value with the sentinel -1. This also fills the
# string column f4, so that column ends up mixing strings and -1.
df.fillna(value = -1)

Unnamed: 0,f1,f2,f3,f4
0,1.5,2.5,-1.0,blue
1,10.0,-1.0,1000.0,red
2,-1.0,-1.0,-1.0,-1
3,2.0,0.0,4.0,red
4,40.0,40.0,40.0,blue
5,1.0,2.0,3.0,-1


In [9]:
# Column means with NaNs skipped. f4 holds strings, so restrict the
# reduction to numeric columns explicitly: recent pandas raises a TypeError
# on the string column instead of silently dropping it.
df.mean(numeric_only = True)

f1     10.900
f2     11.125
f3    261.750
dtype: float64

In [10]:
# Mean imputation: each numeric column's NaNs are replaced by that column's
# mean. numeric_only=True keeps the string column f4 out of the mean, which
# recent pandas would otherwise reject with a TypeError. f4 has no mean, so
# its missing entries stay missing.
df.fillna(df.mean(numeric_only = True))

Unnamed: 0,f1,f2,f3,f4
0,1.5,2.5,261.75,blue
1,10.0,11.125,1000.0,red
2,10.9,11.125,261.75,
3,2.0,0.0,4.0,red
4,40.0,40.0,40.0,blue
5,1.0,2.0,3.0,


In [11]:
# Keep only fully observed rows: drop a row if any of its values is missing.
df.dropna(axis = 0, how = "any")

Unnamed: 0,f1,f2,f3,f4
3,2.0,0.0,4.0,red
4,40.0,40.0,40.0,blue


In [12]:
# Keep the rows where f2 is present; gaps in the other columns are allowed.
df[df["f2"].notna()]

Unnamed: 0,f1,f2,f3,f4
0,1.5,2.5,,blue
3,2.0,0.0,4.0,red
4,40.0,40.0,40.0,blue
5,1.0,2.0,3.0,


In [13]:
# One-hot encode the string column f4 into f4_blue / f4_red; the numeric
# columns pass through unchanged. Rows whose f4 is missing get 0 in both
# indicator columns, so "missing" is indistinguishable from "neither" here.
pd.get_dummies(df)

Unnamed: 0,f1,f2,f3,f4_blue,f4_red
0,1.5,2.5,,1,0
1,10.0,,1000.0,0,1
2,,,,0,0
3,2.0,0.0,4.0,0,1
4,40.0,40.0,40.0,1,0
5,1.0,2.0,3.0,0,0


In [14]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

In [15]:
# Mean imputer: each NaN is replaced by the mean of its own column.
imr = SimpleImputer(
    strategy = "mean",
    missing_values = np.nan,
)

In [17]:
# Learn the per-column means from f1 and f3 only (f2 and f4 are left out;
# f4 holds strings, which have no mean).
imr = imr.fit(df.drop(columns = ["f2", "f4"]))

In [18]:
# Apply the fitted means. Pass the same DataFrame layout that was used for
# fit (rather than a bare .values array) so scikit-learn sees matching
# feature names and does not emit a feature-name mismatch warning.
# NOTE: despite the name, the result is a NumPy array, not a DataFrame.
imputed_df = imr.transform(df.drop(columns = ["f2", "f4"]))

In [19]:
# Inspect the imputed array: the two columns correspond to f1 and f3.
imputed_df

array([[   1.5 ,  261.75],
       [  10.  , 1000.  ],
       [  10.9 ,  261.75],
       [   2.  ,    4.  ],
       [  40.  ,   40.  ],
       [   1.  ,    3.  ]])

In [20]:
from sklearn.preprocessing import MinMaxScaler

In [21]:
# Small 3x3 sample (rows = observations, columns = features) for the scaler demos.
ourData = [
    [10, 10, 10],
    [20, 12, 30],
    [30, 20, 70],
]

In [22]:
# Min-max scaling: each column is mapped linearly so that its minimum
# becomes 0 and its maximum becomes 1.
scaler = MinMaxScaler(feature_range = (0, 1))
scaler.fit_transform(ourData)

array([[0.        , 0.        , 0.        ],
       [0.5       , 0.2       , 0.33333333],
       [1.        , 1.        , 1.        ]])

In [25]:
from sklearn.preprocessing import StandardScaler

In [28]:
# Standardization: centre each column on zero and scale it to unit variance.
# Note that this rebinds `scaler`, replacing the MinMaxScaler from earlier.
scaler = StandardScaler(with_mean = True, with_std = True)
sc = scaler.fit_transform(ourData)
sc

array([[-1.22474487, -0.9258201 , -1.06904497],
       [ 0.        , -0.46291005, -0.26726124],
       [ 1.22474487,  1.38873015,  1.33630621]])

In [31]:
# Sanity check: per-column means of the standardized data should be ~0
# (up to floating-point round-off).
np.mean(sc, axis = 0)

array([0.00000000e+00, 0.00000000e+00, 7.40148683e-17])

In [32]:
# Sanity check: per-column standard deviations of the standardized data should be 1.
np.std(sc, axis = 0)

array([1., 1., 1.])

In [33]:
# Load the Titanic passenger data from the working directory.
# NOTE: this rebinds `df`; the toy frame used in the cells above is no
# longer reachable under that name from here on.
df = pd.read_csv(filepath_or_buffer = "titanic.csv")

In [34]:
# Column dtypes and non-null counts: Age, Cabin and Embarked have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [35]:
# Display the frame (pandas truncates the middle rows of a long frame).
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [36]:
# First rows of the frame; n is the number of rows to show (5 is the default).
df.head(n = 5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [37]:
# Last rows of the frame; n is the number of rows to show (5 is the default).
df.tail(n = 5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [38]:
# (number of rows, number of columns)
df.shape

(891, 12)

In [39]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; the count row reveals missing values (Age: 714 of 891).
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [40]:
# Preview the frame without the identifier-like columns. drop() returns a
# new frame, and the result is not assigned, so `df` itself is unchanged.
df.drop(columns = ["PassengerId", "Name", "Ticket"])

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.2500,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.9250,,S
3,1,1,female,35.0,1,0,53.1000,C123,S
4,0,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S
887,1,1,female,19.0,0,0,30.0000,B42,S
888,0,3,female,,1,2,23.4500,,S
889,1,1,male,26.0,0,0,30.0000,C148,C


In [42]:
# Element-wise boolean mask: True wherever a value is missing.
df.isna()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,True,False
887,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,False,False,True,False,False,False,False,True,False
889,False,False,False,False,False,False,False,False,False,False,False,False


In [48]:
df[df["Embarked"].isna()] # Rows with a missing Embarked value

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [45]:
# Passengers whose Age was not recorded.
df.loc[df["Age"].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [46]:
# How many passengers with an unrecorded Age fall into each Sex group.
df.loc[df["Age"].isna()].groupby(by = "Sex").size()

Sex
female     53
male      124
dtype: int64

In [47]:
# One-hot encode the port of embarkation into one indicator column per
# observed value (C, Q, S). No indicator is created for missing values, so
# the rows with a missing Embarked value get 0 in all three columns.
pd.get_dummies(df["Embarked"])

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


In [51]:
# Passengers strictly between 0 and 10 years old (both bounds exclusive).
# Rows with a missing Age fail both comparisons and are left out.
df.loc[df["Age"].gt(0) & df["Age"].lt(10)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.00,3,1,349909,21.0750,,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.00,1,1,PP 9549,16.7000,G6,S
16,17,0,3,"Rice, Master. Eugene",male,2.00,4,1,382652,29.1250,,Q
24,25,0,3,"Palsson, Miss. Torborg Danira",female,8.00,3,1,349909,21.0750,,S
43,44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3.00,1,2,SC/Paris 2123,41.5792,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
827,828,1,2,"Mallet, Master. Andre",male,1.00,0,2,S.C./PARIS 2079,37.0042,,C
831,832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.7500,,S
850,851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4.00,4,2,347082,31.2750,,S
852,853,0,3,"Boulos, Miss. Nourelain",female,9.00,1,1,2678,15.2458,,C


In [52]:
# Remove the Embarked column from here on. Rebinding `df` instead of using
# inplace=True, together with errors="ignore", keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
df = df.drop(columns = ["Embarked"], errors = "ignore")

In [55]:
# Pull a random sample of 7 rows. random_state pins the draw so the same
# rows come back on every run (Restart & Run All reproduces the output).
df.sample(n = 7, random_state = 42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
603,604,0,3,"Torber, Mr. Ernst William",male,44.0,0,0,364511,8.05,
90,91,0,3,"Christmann, Mr. Emil",male,29.0,0,0,343276,8.05,
73,74,0,3,"Chronopoulos, Mr. Apostolos",male,26.0,1,0,2680,14.4542,
723,724,0,2,"Hodges, Mr. Henry Price",male,50.0,0,0,250643,13.0,
249,250,0,2,"Carter, Rev. Ernest Courtenay",male,54.0,1,0,244252,26.0,
273,274,0,1,"Natsch, Mr. Charles H",male,37.0,0,1,PC 17596,29.7,C118
517,518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.15,


In [59]:
# True where the Cabin string contains the letter "C"; missing cabins are
# treated as False (na = False). Append .sum() to get the count instead.
df["Cabin"].str.contains("C", regex = False, na = False)

0      False
1       True
2      False
3       True
4      False
       ...  
886    False
887    False
888    False
889     True
890    False
Name: Cabin, Length: 891, dtype: bool

In [60]:
# Number of passengers in each Sex group.
df.groupby("Sex").size()

Sex
female    314
male      577
dtype: int64

In [61]:
# Average fare paid within each Sex group.
df.groupby("Sex")["Fare"].mean()

Sex
female    44.479818
male      25.523893
Name: Fare, dtype: float64

In [62]:
# Contingency table: passenger counts for each (Survived, Sex) combination.
pd.crosstab(index = df["Survived"], columns = df["Sex"])

Sex,female,male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,81,468
1,233,109


In [64]:
# Average fare for every (Pclass, Sex) combination.
df.groupby(by = ["Pclass", "Sex"])["Fare"].mean()

Pclass  Sex   
1       female    106.125798
        male       67.226127
2       female     21.970121
        male       19.741782
3       female     16.118810
        male       12.661633
Name: Fare, dtype: float64

In [66]:
# Per-sex averages of the numeric columns for passengers younger than 10.
# numeric_only=True is required because the frame still carries string
# columns (Name, Ticket, Cabin); recent pandas raises a TypeError on them
# instead of silently dropping them from the result.
df[df["Age"] < 10].groupby("Sex").mean(numeric_only = True)

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
female,421.9,0.633333,2.666667,4.416667,1.533333,1.433333,27.972637
male,427.40625,0.59375,2.59375,3.770938,2.15625,1.375,33.017969
