In [2]:
import numpy as np # NumPy stands for Numerical Python

data = [15, 16, 18, 19, 22, 24, 29, 30, 34] # 9 data points, already sorted

# Each entry pairs the printed label with the statistic NumPy computes for it.
summary_stats = [
    ("mean:", np.mean(data)), # sum / 9
    ("median:", np.median(data)), # middle value (5th of the 9 sorted points)
    ("50th percentile (median):", np.percentile(data, 50)), # same value as the median
    # By default np.percentile interpolates linearly at 0-based index (n - 1) * q:
    ("25th percentile:", np.percentile(data, 25)), # (9 - 1) x .25 = 2 -> exactly the 3rd value
    ("75th percentile:", np.percentile(data, 75)), # (9 - 1) x .75 = 6 -> exactly the 7th value
    ("variance:", np.var(data)), # average of squared differences from the mean (divides by n, not n - 1)
    ("standard deviation:", np.std(data)), # square root of the variance above
]

for label, value in summary_stats:
    print(label, value)
print()

mean: 23.0
median: 22.0
50th percentile (median): 22.0
25th percentile: 18.0
75th percentile: 29.0
variance: 40.22222222222222
standard deviation: 6.342099196813483



In [4]:
import pandas as pd # Pandas stands for Panel Data

# Display settings so the printed frames below are neither truncated nor wrapped.
pd.options.display.max_columns = 8 # show up to 8 columns before pandas inserts a '...' break
pd.options.display.width = 100 # lets all DataFrame columns display without a line break (default = 80)

# 'read_csv()' takes a file in csv (comma separated value) format and converts it to a Pandas DataFrame
df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')

print(df.head()) # check first 5 rows ('Survived' is shown as 1/0 numbers, not True/False)
print()
print(df.sample(3, random_state=0)) # check 3 random rows; the fixed seed makes a re-run show the same rows
print()

print(df.describe()) # check summary statistics of the numeric columns
print()

   Survived  Pclass     Sex   Age  Siblings/Spouses  Parents/Children     Fare
0         0       3    male  22.0                 1                 0   7.2500
1         1       1  female  38.0                 1                 0  71.2833
2         1       3  female  26.0                 0                 0   7.9250
3         1       1  female  35.0                 1                 0  53.1000
4         0       3    male  35.0                 0                 0   8.0500

     Survived  Pclass     Sex   Age  Siblings/Spouses  Parents/Children     Fare
315         0       2    male  54.0                 0                 0  14.0000
250         0       3  female  29.0                 1                 1  10.4625
716         1       2  female   6.0                 0                 1  33.0000

         Survived      Pclass         Age  Siblings/Spouses  Parents/Children       Fare
count  887.000000  887.000000  887.000000        887.000000        887.000000  887.00000
mean     0.385569    2

In [7]:
# Single brackets select one column as a Series - a labeled 1d array.
col = df['Fare']
print(col, end="\n\n")

# Double brackets (a list of column names) select a DataFrame (2d), even for a single column.
col = df[['Fare']]
print(col, end="\n\n")

# A longer list of names returns a DataFrame made of several columns.
wanted_columns = ['Age', 'Sex', 'Survived']
small_df = df[wanted_columns]
print(small_df.head(), end="\n\n")

0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.0500
        ...   
882    13.0000
883    30.0000
884    23.4500
885    30.0000
886     7.7500
Name: Fare, Length: 887, dtype: float64

        Fare
0     7.2500
1    71.2833
2     7.9250
3    53.1000
4     8.0500
..       ...
882  13.0000
883  30.0000
884  23.4500
885  30.0000
886   7.7500

[887 rows x 1 columns]

    Age     Sex  Survived
0  22.0    male         0
1  38.0  female         1
2  26.0  female         1
3  35.0  female         1
4  35.0    male         0



In [8]:
# Create a new boolean column 'Male': True where 'Sex' is 'male', False otherwise.
# Booleans are easier for Python to do computations on than the strings in the 'Sex' column.
df['Male'] = df['Sex'].eq('male')
print(df.head()) # the new column shows up as the last column of the DataFrame

   Survived  Pclass     Sex   Age  Siblings/Spouses  Parents/Children     Fare   Male
0         0       3    male  22.0                 1                 0   7.2500   True
1         1       1  female  38.0                 1                 0  71.2833  False
2         1       3  female  26.0                 0                 0   7.9250  False
3         1       1  female  35.0                 1                 0  53.1000  False
4         0       3    male  35.0                 0                 0   8.0500   True


In [13]:
# Pandas DataFrames are easier for humans to read; NumPy arrays are better for calculations.
# DataFrames carry indexes (row/column labels); arrays do not.

print(df['Fare'].to_numpy()[:25]) # '.to_numpy()' returns a Pandas Series as a 1d NumPy array ('[:25]' keeps the first 25 values only)
print()
arr = df[['Pclass', 'Fare', 'Age']].to_numpy()[:25] # a Pandas DataFrame converts to a 2d NumPy array (first 25 rows kept here)
print(arr) # built once above and reused, instead of repeating the conversion just to print it
print()
print(arr.shape) # '.shape' returns the shape of the array, also works on DataFrames


[ 7.25   71.2833  7.925  53.1     8.05    8.4583 51.8625 21.075  11.1333
 30.0708 16.7    26.55    8.05   31.275   7.8542 16.     29.125  13.
 18.      7.225  26.     13.      8.0292 35.5    21.075 ]

[[ 3.      7.25   22.    ]
 [ 1.     71.2833 38.    ]
 [ 3.      7.925  26.    ]
 [ 1.     53.1    35.    ]
 [ 3.      8.05   35.    ]
 [ 3.      8.4583 27.    ]
 [ 1.     51.8625 54.    ]
 [ 3.     21.075   2.    ]
 [ 3.     11.1333 27.    ]
 [ 2.     30.0708 14.    ]
 [ 3.     16.7     4.    ]
 [ 1.     26.55   58.    ]
 [ 3.      8.05   20.    ]
 [ 3.     31.275  39.    ]
 [ 3.      7.8542 14.    ]
 [ 2.     16.     55.    ]
 [ 3.     29.125   2.    ]
 [ 2.     13.     23.    ]
 [ 3.     18.     31.    ]
 [ 3.      7.225  22.    ]
 [ 2.     26.     35.    ]
 [ 2.     13.     34.    ]
 [ 3.      8.0292 15.    ]
 [ 1.     35.5    28.    ]
 [ 3.     21.075   8.    ]]

(25, 3)


In [12]:
# 2d NumPy arrays are indexed as arr[row, column], counting from 0:
#   arr[0, 1]  -> the value in the first row, second column
#   arr[0]     -> the whole first row
#   arr[:, 2]  -> the whole third column (':' selects every row)
print(arr[0, 1], arr[0], arr[:, 2], sep="\n")

7.25
[ 3.    7.25 22.  ]
[22. 38. 26. 35. 35. 27. 54.  2. 27. 14.  4. 58. 20. 39. 14. 55.  2. 23.
 31. 22. 35. 34. 15. 28.  8.]


In [14]:
# Boolean masking: select all rows that meet a certain criterion.
# 'arr[:, 2]' is the whole third column ('Age'); comparing it with 18 gives a
# new array of booleans saying whether each passenger's age is under 18.
mask = arr[:, 2] < 18
print(arr[mask]) # apply the mask to the original array to return only the rows for passengers with age < 18
print()
print(arr[arr[:, 2] < 18]) # same result in one line, without defining a 'mask' variable
print()

# Same 2d NumPy array as before, this time for all 887 passengers, not just the first 25.
# NOTE: this rebinds 'arr', so earlier cells that print 'arr' show different output if re-run afterwards.
arr = df[['Pclass', 'Fare', 'Age']].to_numpy()
mask = arr[:, 2] < 18
print(mask.sum()) # True counts as 1 and False as 0, so summing the mask gives the number of passengers with age < 18
print((arr[:, 2] < 18).sum()) # same thing in one line
print()

# code and comments by github.com/alandavidgrunberg


[[ 3.     21.075   2.    ]
 [ 2.     30.0708 14.    ]
 [ 3.     16.7     4.    ]
 [ 3.      7.8542 14.    ]
 [ 3.     29.125   2.    ]
 [ 3.      8.0292 15.    ]
 [ 3.     21.075   8.    ]]

[[ 3.     21.075   2.    ]
 [ 2.     30.0708 14.    ]
 [ 3.     16.7     4.    ]
 [ 3.      7.8542 14.    ]
 [ 3.     29.125   2.    ]
 [ 3.      8.0292 15.    ]
 [ 3.     21.075   8.    ]]

130
130

