==== Creating Series ====

In [1]:
#import NumPy and Pandas into your Python project
import numpy as np
import pandas as pd

In [2]:
# create a Pandas series from a Python list.
car_manufacturers = ['Volkswagen','Ford','Mercedes-Benz','BMW','Nissan']
pds_car_manufacturers = pd.Series(data=car_manufacturers)
print (pds_car_manufacturers)
print (pds_car_manufacturers[2])

0       Volkswagen
1             Ford
2    Mercedes-Benz
3              BMW
4           Nissan
dtype: object
Mercedes-Benz


In [3]:
# create a Pandas series from a Python dictionary
#
# Pandas does not generate a series index.
cars = {'RJ09VWQ':'Blue Volkswagen Polo',
        'WQ81R09':'Red Ford Focus',
        'PB810AQ':'White Mercedes-Benz E-Class',
        'TU914A8':'Silver BMW 1 Series'}
pds_cars = pd.Series(data=cars)
print (pds_cars)
print (pds_cars[2])
print (pds_cars['WQ81R09'])

RJ09VWQ           Blue Volkswagen Polo
WQ81R09                 Red Ford Focus
PB810AQ    White Mercedes-Benz E-Class
TU914A8            Silver BMW 1 Series
dtype: object
White Mercedes-Benz E-Class
Red Ford Focus


In [4]:
# you can access the indices and the values of a series individually
print (pds_cars.index)
print (pds_cars.values)

Index(['RJ09VWQ', 'WQ81R09', 'PB810AQ', 'TU914A8'], dtype='object')
['Blue Volkswagen Polo' 'Red Ford Focus' 'White Mercedes-Benz E-Class'
 'Silver BMW 1 Series']


In [5]:
# load the contents of a file into a pandas Dataframe
input_file = './titanic_dataset/original/train.csv'
df_iris = pd.read_csv(input_file)

# print the names of the columns
print (df_iris.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [6]:
# create a Dataframe with a subset of the columns in df_iris
df_iris_subset = df_iris[['PassengerId', 'Survived', 'Pclass', 'Sex','Fare', 'Age']]

==== Getting information on a Dataframe ====

In [7]:
# how many rows and columns in the dataset
print (df_iris_subset.shape)

# print first 5 rows
print (df_iris_subset.head())

# print last 3 rows
print (df_iris_subset.tail(3))

# print a random sample of 10 rows
print (df_iris_subset.sample(10))

(891, 6)
   PassengerId  Survived  Pclass     Sex     Fare   Age
0            1         0       3    male   7.2500  22.0
1            2         1       1  female  71.2833  38.0
2            3         1       3  female   7.9250  26.0
3            4         1       1  female  53.1000  35.0
4            5         0       3    male   8.0500  35.0
     PassengerId  Survived  Pclass     Sex   Fare   Age
888          889         0       3  female  23.45   NaN
889          890         1       1    male  30.00  26.0
890          891         0       3    male   7.75  32.0
     PassengerId  Survived  Pclass     Sex     Fare   Age
133          134         1       2  female  26.0000  29.0
878          879         0       3    male   7.8958   NaN
451          452         0       3    male  19.9667   NaN
149          150         0       2    male  13.0000  42.0
57            58         0       3    male   7.2292  28.5
443          444         1       2  female  13.0000  28.0
517          518         

In [8]:
# get information on data types and memory footprint
print (df_iris_subset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Fare           891 non-null float64
Age            714 non-null float64
dtypes: float64(2), int64(3), object(1)
memory usage: 41.8+ KB
None


In [9]:
# get statistical information on numeric columns
print (df_iris_subset.describe())

       PassengerId    Survived      Pclass        Fare         Age
count   891.000000  891.000000  891.000000  891.000000  714.000000
mean    446.000000    0.383838    2.308642   32.204208   29.699118
std     257.353842    0.486592    0.836071   49.693429   14.526497
min       1.000000    0.000000    1.000000    0.000000    0.420000
25%     223.500000    0.000000    2.000000    7.910400   20.125000
50%     446.000000    0.000000    3.000000   14.454200   28.000000
75%     668.500000    1.000000    3.000000   31.000000   38.000000
max     891.000000    1.000000    3.000000  512.329200   80.000000


In [10]:
# mean of all columns
print (df_iris_subset.mean())

# the following statement is identical to the previous one, 
# as axis = 0 implies columns.
print (df_iris_subset.mean(axis=0))

#correlation between columns
print (df_iris_subset.corr())

PassengerId    446.000000
Survived         0.383838
Pclass           2.308642
Fare            32.204208
Age             29.699118
dtype: float64
PassengerId    446.000000
Survived         0.383838
Pclass           2.308642
Fare            32.204208
Age             29.699118
dtype: float64
             PassengerId  Survived    Pclass      Fare       Age
PassengerId     1.000000 -0.005007 -0.035144  0.012658  0.036847
Survived       -0.005007  1.000000 -0.338481  0.257307 -0.077221
Pclass         -0.035144 -0.338481  1.000000 -0.549500 -0.369226
Fare            0.012658  0.257307 -0.549500  1.000000  0.096067
Age             0.036847 -0.077221 -0.369226  0.096067  1.000000


In [11]:
# highlight the null values in a random sample of data
print (df_iris_subset.sample(10).isnull())
       
# find out if there are any missing values in the data
print (df_iris_subset.isnull().sum())

# number of non-null values in each column
print (df_iris_subset.count())

     PassengerId  Survived  Pclass    Sex   Fare    Age
147        False     False   False  False  False  False
555        False     False   False  False  False  False
513        False     False   False  False  False  False
51         False     False   False  False  False  False
159        False     False   False  False  False   True
860        False     False   False  False  False  False
728        False     False   False  False  False  False
596        False     False   False  False  False   True
561        False     False   False  False  False  False
39         False     False   False  False  False  False
PassengerId      0
Survived         0
Pclass           0
Sex              0
Fare             0
Age            177
dtype: int64
PassengerId    891
Survived       891
Pclass         891
Sex            891
Fare           891
Age            714
dtype: int64


==== selecting data ====

In [12]:
# extract a single column as a series object
pds_class = df_iris_subset[['Pclass']]
print (pds_class.head())

# extract a specific subset of named columns into a new dataframe
df_test1 = df_iris_subset[['PassengerId', 'Age']]
print (df_test1.head())

# extract first 3 rows into a new data frame
df_test2 = df_iris_subset[0:3]
print (df_test2.head())

# extract first 3 rows and 3 columns into a new dataframe
df_test3 = df_iris_subset.iloc[0:3,0:3]
print (df_test3.head())

# extracting all rows where Age > 26 into a new dataframe
df_test4 = df_iris_subset[df_iris_subset['Age'] > 26]
print (df_test4.count())

   Pclass
0       3
1       1
2       3
3       1
4       3
   PassengerId   Age
0            1  22.0
1            2  38.0
2            3  26.0
3            4  35.0
4            5  35.0
   PassengerId  Survived  Pclass     Sex     Fare   Age
0            1         0       3    male   7.2500  22.0
1            2         1       1  female  71.2833  38.0
2            3         1       3  female   7.9250  26.0
   PassengerId  Survived  Pclass
0            1         0       3
1            2         1       1
2            3         1       3
PassengerId    395
Survived       395
Pclass         395
Sex            395
Fare           395
Age            395
dtype: int64
