## Introduction to Pandas

#### 1. Reading a CSV file

In [None]:
# Check the current working directory.
# Relative paths such as "datasets/cars.csv" are resolved against this folder.
import os
os.getcwd()

In [None]:
# Set the working directory (optional).
# Assign the folder that contains the "datasets" directory to WORKING_DIR.
# While it is None this cell does nothing, so "Restart & Run All" no longer
# stops here with a FileNotFoundError caused by a placeholder path.
WORKING_DIR = None  # e.g. r"C:\projects\pandas-intro"
if WORKING_DIR is not None:
    os.chdir(WORKING_DIR)

In [31]:
import pandas as pd

In [26]:
# Load the cars data set from its CSV file into a DataFrame
cars_csv_path = "datasets/cars.csv"
cars = pd.read_csv(cars_csv_path)

#### 2. Some initial steps with data

In [9]:
# A. Dimensions of the data set, returned as a (rows, columns) tuple
cars.shape

(406, 9)

In [10]:
# B. Number of rows in the data set (len() of a DataFrame counts its rows)
len(cars)

406

In [12]:
# OR simply take the first element of the shape tuple
cars.shape[0]

406

In [13]:
# C. Number of columns in the data set (second element of the shape tuple)
cars.shape[1]

9

In [14]:
# OR count the column labels
len(cars.columns)

9

In [16]:
# Row labels (the index) of the data set
cars.index

RangeIndex(start=0, stop=406, step=1)

In [15]:
# Inspect the data type (dtype) pandas assigned to each column
cars.dtypes

Car              object
MPG             float64
Cylinders         int64
Displacement    float64
Horsepower        int64
Weight            int64
Acceleration    float64
Model             int64
Origin           object
dtype: object

In [17]:
# D. Getting the variable (column) names as an Index object
cars.columns

Index(['Car', 'MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
       'Acceleration', 'Model', 'Origin'],
      dtype='object')

In [18]:
# OR as a plain Python list (iterating a DataFrame yields its column labels)
list(cars)

['Car',
 'MPG',
 'Cylinders',
 'Displacement',
 'Horsepower',
 'Weight',
 'Acceleration',
 'Model',
 'Origin']

In [19]:
# E. Printing the first 5 rows of the data set (head() defaults to 5 rows)
cars.head()

Unnamed: 0,Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin
0,Chevrolet Chevelle Malibu,18.0,8,307.0,130,3504,12.0,70,US
1,Buick Skylark 320,15.0,8,350.0,165,3693,11.5,70,US
2,Plymouth Satellite,18.0,8,318.0,150,3436,11.0,70,US
3,AMC Rebel SST,16.0,8,304.0,150,3433,12.0,70,US
4,Ford Torino,17.0,8,302.0,140,3449,10.5,70,US


In [20]:
# F. Printing the first 10 rows of the data set
cars.head(10)

Unnamed: 0,Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin
0,Chevrolet Chevelle Malibu,18.0,8,307.0,130,3504,12.0,70,US
1,Buick Skylark 320,15.0,8,350.0,165,3693,11.5,70,US
2,Plymouth Satellite,18.0,8,318.0,150,3436,11.0,70,US
3,AMC Rebel SST,16.0,8,304.0,150,3433,12.0,70,US
4,Ford Torino,17.0,8,302.0,140,3449,10.5,70,US
5,Ford Galaxie 500,15.0,8,429.0,198,4341,10.0,70,US
6,Chevrolet Impala,14.0,8,454.0,220,4354,9.0,70,US
7,Plymouth Fury iii,14.0,8,440.0,215,4312,8.5,70,US
8,Pontiac Catalina,14.0,8,455.0,225,4425,10.0,70,US
9,AMC Ambassador DPL,15.0,8,390.0,190,3850,8.5,70,US


In [21]:
# G. Printing the last 5 rows of the data set (tail() defaults to 5 rows)
cars.tail()

Unnamed: 0,Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin
401,Ford Mustang GL,27.0,4,140.0,86,2790,15.6,82,US
402,Volkswagen Pickup,44.0,4,97.0,52,2130,24.6,82,Europe
403,Dodge Rampage,32.0,4,135.0,84,2295,11.6,82,US
404,Ford Ranger,28.0,4,120.0,79,2625,18.6,82,US
405,Chevy S-10,31.0,4,119.0,82,2720,19.4,82,US


In [None]:
# H. Displaying the whole DataFrame
# NOTE: when the frame has more rows than the pandas option display.max_rows,
# the notebook shows a truncated view rather than every row.
cars

In [23]:
# Draw 5 rows at random; without random_state the selection changes on every run
cars.sample(5)

Unnamed: 0,Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin
63,Plymouth Cricket,26.0,4,91.0,70,1955,20.5,71,US
65,Dodge Colt Hardtop,25.0,4,97.5,80,2126,17.0,72,US
175,Ford Pinto,23.0,4,140.0,83,2639,17.0,75,US
337,Renault Lecar Deluxe,40.9,4,85.0,0,1835,17.3,80,Europe
133,Ford Maverick,21.0,6,200.0,0,2875,17.0,74,US


In [24]:
# Draw 5 random rows with a fixed seed so the same rows come back on every run
cars.sample(5, random_state=5)

Unnamed: 0,Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin
284,Peugeot 604sl,16.2,6,163.0,133,3410,15.8,78,Europe
71,Plymouth Fury III,15.0,8,318.0,150,4135,13.5,72,US
49,Dodge Monaco (sw),12.0,8,383.0,180,4955,11.5,71,US
335,Mercedes-Benz 240d,30.0,4,146.0,67,3250,21.8,80,Europe
43,Ford Torino 500,19.0,6,250.0,88,3302,15.5,71,US


#### 3. Choosing a single variable

In [27]:
# Method 1 - use square brackets with the column name; the result is a Series
cars["MPG"]

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
401    27.0
402    44.0
403    32.0
404    28.0
405    31.0
Name: MPG, Length: 406, dtype: float64

In [28]:
# Square brackets work the same way for any column label
cars["Acceleration"]

0      12.0
1      11.5
2      11.0
3      12.0
4      10.5
       ... 
401    15.6
402    24.6
403    11.6
404    18.6
405    19.4
Name: Acceleration, Length: 406, dtype: float64

In [29]:
# Method 2 - use dot (.) attribute access.
# This only works when the column name is a valid Python identifier that does
# not clash with an existing DataFrame attribute or method.
cars.Acceleration

0      12.0
1      11.5
2      11.0
3      12.0
4      10.5
       ... 
401    15.6
402    24.6
403    11.6
404    18.6
405    19.4
Name: Acceleration, Length: 406, dtype: float64

In [30]:
# Dot access on a text (object dtype) column
cars.Origin

0          US
1          US
2          US
3          US
4          US
        ...  
401        US
402    Europe
403        US
404        US
405        US
Name: Origin, Length: 406, dtype: object

#### 4. Some Basic Statistical Functions

In [35]:
# Quick look at the data again before computing summary statistics
cars.head()

Unnamed: 0,Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin
0,Chevrolet Chevelle Malibu,18.0,8,307.0,130,3504,12.0,70,US
1,Buick Skylark 320,15.0,8,350.0,165,3693,11.5,70,US
2,Plymouth Satellite,18.0,8,318.0,150,3436,11.0,70,US
3,AMC Rebel SST,16.0,8,304.0,150,3433,12.0,70,US
4,Ford Torino,17.0,8,302.0,140,3449,10.5,70,US


In [36]:
# Total of the Weight column
cars["Weight"].sum()

1209642

In [37]:
# Arithmetic mean of the Weight column
cars["Weight"].mean()

2979.4137931034484

In [38]:
# Median (middle value) of the MPG column
cars["MPG"].median()

22.35

In [39]:
# Standard deviation of MPG (pandas default: sample estimate, ddof=1)
cars["MPG"].std()

8.40177735227059

In [40]:
# Variance of MPG (pandas default: sample estimate, ddof=1)
cars["MPG"].var()

70.58986267712702

In [41]:
# Smallest value in the Cylinders column
cars["Cylinders"].min()

3

In [42]:
# Largest value in the Cylinders column
cars["Cylinders"].max()

8

In [43]:
# 25th percentile (0.25 quantile, i.e. the first quartile) of MPG
cars["MPG"].quantile(0.25)

17.0

In [44]:
# 90th percentile (0.9 quantile) of MPG
cars["MPG"].quantile(0.9)

34.25

In [45]:
# Several percentiles at once: pass a list to get a Series keyed by quantile
cars["MPG"].quantile([0.9, 0.95, 0.99])

0.90    34.250
0.95    37.000
0.99    43.385
Name: MPG, dtype: float64

In [46]:
# Inter-quartile range (IQR): third quartile minus first quartile of MPG
mpg_q3 = cars["MPG"].quantile(0.75)
mpg_q1 = cars["MPG"].quantile(0.25)
mpg_q3 - mpg_q1

12.0

In [47]:
# Summary statistics in one call: count, mean, std, min, quartiles and max
cars["MPG"].describe()

count    406.000000
mean      23.051232
std        8.401777
min        0.000000
25%       17.000000
50%       22.350000
75%       29.000000
max       46.600000
Name: MPG, dtype: float64

### Some Functions and their Descriptions

    Function	Description
---------------------------------------------
       count	Number of non-null observations
         sum	Sum of values
        mean	Mean of values
         mad	Mean absolute deviation (removed in pandas 2.0; use (s - s.mean()).abs().mean() instead)
      median	Arithmetic median of values
         min	Minimum
         max	Maximum
        mode	Mode
         abs	Absolute Value
        prod	Product of values
         std	Unbiased standard deviation
         var	Unbiased variance
         sem	Unbiased standard error of the mean
        skew	Unbiased skewness (3rd moment)
        kurt	Unbiased kurtosis (4th moment)
    quantile	Sample quantile (value at %)
      cumsum	Cumulative sum
     cumprod	Cumulative product
      cummax	Cumulative maximum
      cummin	Cumulative minimum