In [1]:
import pandas as pd
import numpy as np

In [2]:
array_a = np.array(['U', 'Love', 'Python'])

In [3]:
series_from_arr = pd.Series(array_a)
series_from_arr

0         U
1      Love
2    Python
dtype: object

In [4]:
type(series_from_arr)

pandas.core.series.Series

##### Creating Series with Index from array

In [5]:
series_from_arr_index = pd.Series(array_a, index=['a', 'b', 'c'])
series_from_arr_index

a         U
b      Love
c    Python
dtype: object

**Note**: Series can be created from Python List similar to the above by passing the list as an argument to `pd.Series()`

##### Using Python Dictionary

In [6]:
dict_a = {'Ram': 90, 'Hari': 86.5, 'Gita': 87.3}

In [7]:
series_from_dict = pd.Series(dict_a)
series_from_dict

Ram     90.0
Hari    86.5
Gita    87.3
dtype: float64

In [8]:
# Accessing the keys or indices
series_from_dict.index

Index(['Ram', 'Hari', 'Gita'], dtype='object')

In [9]:
# Accessing values
series_from_dict.values

array([90. , 86.5, 87.3])

### Creating Pandas DataFrames

Using Python List

In [10]:
list_a = ['Python', 'Ruby', 'Rust', 'Java', 'Go']

In [11]:
df_from_list = pd.DataFrame(list_a)
df_from_list

Unnamed: 0,0
0,Python
1,Ruby
2,Rust
3,Java
4,Go


In [12]:
# To create a column name use the following
df_from_list_col_name = pd.DataFrame(list_a, columns=['Programming Language'])
df_from_list_col_name

Unnamed: 0,Programming Language
0,Python
1,Ruby
2,Rust
3,Java
4,Go


Creating DataFrame form List of list

In [13]:
list_b = [['Ram Thapa', 'Koteshowr'], ['Nitesh Rai', 'London']]

In [14]:
df1 = pd.DataFrame(list_b, columns=['Name', 'Address'])
df1

Unnamed: 0,Name,Address
0,Ram Thapa,Koteshowr
1,Nitesh Rai,London


Using Python Dictionary

In [15]:
dict_b = {'Name': ['Ram Thapa', 'Nitesh Rai'], 'Address': ['Koteshowr', 'London']}

In [16]:
df2 = pd.DataFrame(dict_b)
df2

Unnamed: 0,Name,Address
0,Ram Thapa,Koteshowr
1,Nitesh Rai,London


In [17]:
# Create a row label
df_from_dict_labels = pd.DataFrame(dict_b, index=['A', 'B'])
df_from_dict_labels

Unnamed: 0,Name,Address
A,Ram Thapa,Koteshowr
B,Nitesh Rai,London


#### Indexing and Slicing Pandas Series

Indexing by Item name

In [18]:
series_from_arr_index

a         U
b      Love
c    Python
dtype: object

In [19]:
series_from_arr_index['c']

'Python'

In [20]:
# Alphabetical indexing is (inclusive)
series_from_arr_index['a':'c']

a         U
b      Love
c    Python
dtype: object

In [21]:
# Numerical indexing is (exclusive)
series_from_arr_index[0:2]

a       U
b    Love
dtype: object

### Reading a csv file in Pandas

In [22]:
car_data = pd.read_csv('CarPrice.csv')

In [23]:
# First 5 rows
car_data.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [24]:
# Last 10 rows
car_data.tail(10)

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
195,196,-1,volvo 144ea,gas,std,four,wagon,rwd,front,104.3,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,13415.0
196,197,-2,volvo 244dl,gas,std,four,sedan,rwd,front,104.3,...,141,mpfi,3.78,3.15,9.5,114,5400,24,28,15985.0
197,198,-1,volvo 245,gas,std,four,wagon,rwd,front,104.3,...,141,mpfi,3.78,3.15,9.5,114,5400,24,28,16515.0
198,199,-2,volvo 264gl,gas,turbo,four,sedan,rwd,front,104.3,...,130,mpfi,3.62,3.15,7.5,162,5100,17,22,18420.0
199,200,-1,volvo diesel,gas,turbo,four,wagon,rwd,front,104.3,...,130,mpfi,3.62,3.15,7.5,162,5100,17,22,18950.0
200,201,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,202,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.4,23.0,106,4800,26,27,22470.0
204,205,-1,volvo 264gl,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,19,25,22625.0


In [25]:
# Check column names
car_data.columns

Index(['car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration',
       'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
       'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
       'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
       'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
       'price'],
      dtype='object')

In [26]:
car_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

**Note**: objects are string in DataFrames

In [27]:
# Basic statistical inforamtion of numeric values from dataset
car_data.describe()

Unnamed: 0,car_ID,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,103.0,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,3.329756,3.255415,10.142537,104.117073,5125.121951,25.219512,30.75122,13276.710571
std,59.322565,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,0.270844,0.313597,3.97204,39.544167,476.985643,6.542142,6.886443,7988.852332
min,1.0,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,52.0,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,3.15,3.11,8.6,70.0,4800.0,19.0,25.0,7788.0
50%,103.0,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,3.31,3.29,9.0,95.0,5200.0,24.0,30.0,10295.0
75%,154.0,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,3.58,3.41,9.4,116.0,5500.0,30.0,34.0,16503.0
max,205.0,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,3.94,4.17,23.0,288.0,6600.0,49.0,54.0,45400.0


In [28]:
# Description of objects
car_data.describe(include='object')

Unnamed: 0,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,enginetype,cylindernumber,fuelsystem
count,205,205,205,205,205,205,205,205,205,205
unique,147,2,2,2,5,3,2,7,7,8
top,toyota corona,gas,std,four,sedan,fwd,front,ohc,four,mpfi
freq,6,185,168,115,96,120,202,148,159,94


In [29]:
# Car names and their occurances
car_data['CarName'].value_counts()

CarName
toyota corona           6
toyota corolla          6
peugeot 504             6
subaru dl               4
mitsubishi mirage g4    3
                       ..
mazda glc 4             1
mazda rx2 coupe         1
maxda glc deluxe        1
maxda rx3               1
volvo 246               1
Name: count, Length: 147, dtype: int64

#### Checking null values

In [30]:
car_data_null = pd.read_csv('CarPrice (copy).csv')

In [31]:
# Checking if the data contains null values (N/A)
car_data_null.isna().sum()

car_ID              0
symboling           0
CarName             2
fueltype            0
aspiration          2
doornumber          0
carbody             2
drivewheel          0
enginelocation      0
wheelbase           2
carlength           0
carwidth            0
carheight           0
curbweight          1
enginetype          1
cylindernumber      3
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

**Note**: If data is numeric we fill using either `Mean` or `Median` and for non-numeric data we fill using the data with most frequency (`Mode`).

In [32]:
car_data_no_null = car_data_null.dropna()

In [33]:
car_data_no_null.isna().sum()

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

In [34]:
# Length of non-null dataset
len(car_data_no_null)

193

In [35]:
# Length of null dataset
len(car_data_null)

205

In [36]:
car_data_null['carbody']

0      convertible
1      convertible
2        hatchback
3            sedan
4            sedan
          ...     
200          sedan
201          sedan
202          sedan
203          sedan
204          sedan
Name: carbody, Length: 205, dtype: object

#### Filling missing values - Categorical data

In [37]:
car_data_null['carbody'].isna().sum()

2

In [38]:
# It still doesn't fill the dataset as specified
car_data_null['carbody'].fillna("TESLA")

0      convertible
1      convertible
2        hatchback
3            sedan
4            sedan
          ...     
200          sedan
201          sedan
202          sedan
203          sedan
204          sedan
Name: carbody, Length: 205, dtype: object

In [39]:
car_data_null['carbody'].isna().sum()

2

#### Option 1 for filling the data

In [40]:
car_data_null['carbody_filled'] = car_data_null['carbody'].fillna('TESLA')

In [41]:
car_data_null['carbody_filled'].isna().sum()

0

In [42]:
# Get the data from dataset where carbody_filled is equal to 'TESLA'
car_data_null[car_data_null['carbody_filled'] == 'TESLA']

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,carbody_filled
9,10,0,audi 5000s (diesel),gas,turbo,two,,4wd,front,99.5,...,mpfi,3.13,3.4,7.0,160,5500,16,22,17859.167,TESLA
18,19,2,chevrolet impala,gas,std,two,,fwd,front,88.4,...,2bbl,2.91,3.03,9.5,48,5100,47,53,5151.0,TESLA


#### Option 2 for filling the data

In [43]:
# Fills the N/A values with specified value directly in the column
car_data_null['carbody'].fillna('TESLA', inplace=True)

In [44]:
# Get the data from dataset where carbody_filled is equal to 'TESLA'
car_data_null[car_data_null['carbody_filled'] == 'TESLA']

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,carbody_filled
9,10,0,audi 5000s (diesel),gas,turbo,two,TESLA,4wd,front,99.5,...,mpfi,3.13,3.4,7.0,160,5500,16,22,17859.167,TESLA
18,19,2,chevrolet impala,gas,std,two,TESLA,fwd,front,88.4,...,2bbl,2.91,3.03,9.5,48,5100,47,53,5151.0,TESLA


#### Accessing using iloc() and loc()
- `iloc()`:
    - It is an indexed-based selecting method which means that we have to pass an integer index in the method to select a specific row/column.
    - Doesn't accept boolean data.
- `loc()`:
    - It is label based data selecting method which means that we have to pass the name of the row or column which we want to select.
    - Accepts boolean data.

In [45]:
car_data_null[car_data_null['carbody_filled'] == 'TESLA'].iloc[1]

car_ID                            19
symboling                          2
CarName             chevrolet impala
fueltype                         gas
aspiration                       std
doornumber                       two
carbody                        TESLA
drivewheel                       fwd
enginelocation                 front
wheelbase                       88.4
carlength                      141.1
carwidth                        60.3
carheight                       53.2
curbweight                    1488.0
enginetype                         l
cylindernumber                 three
enginesize                        61
fuelsystem                      2bbl
boreratio                       2.91
stroke                          3.03
compressionratio                 9.5
horsepower                        48
peakrpm                         5100
citympg                           47
highwaympg                        53
price                         5151.0
carbody_filled                 TESLA
N

In [46]:
car_data['wheelbase'].min()

86.6

In [47]:
car_data['wheelbase'].max()

120.9

In [48]:
mean_wheelbase = car_data['wheelbase'].mean()
mean_wheelbase

98.75658536585367

In [49]:
median_wheelbase = car_data['wheelbase'].median()
median_wheelbase

97.0

In [50]:
car_data_null['wheelbase'].fillna(mean_wheelbase, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  car_data_null['wheelbase'].fillna(mean_wheelbase, inplace=True)


In [51]:
car_data_null['wheelbase'].isna().sum()

0

In [52]:
car_data_null['aspiration']

0        std
1        std
2        std
3        std
4        std
       ...  
200      std
201    turbo
202      std
203    turbo
204    turbo
Name: aspiration, Length: 205, dtype: object

In [53]:
# There can be data with same frequency (modes) so we access the first index of mode (array)
car_data_null['aspiration'].fillna(car_data['aspiration'].mode()[0], inplace=True)

In [54]:
car_data_null['aspiration'].isna().sum()

0

In [55]:
car_data_null['carbody'] = car_data_null['carbody'].replace({'TESLA': 'convertible'})

In [56]:
car_data_null[car_data_null['carbody'] == 'convertible']

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,carbody_filled
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0,convertible
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0,convertible
9,10,0,audi 5000s (diesel),gas,turbo,two,convertible,4wd,front,99.5,...,mpfi,3.13,3.4,7.0,160,5500,16,22,17859.167,TESLA
18,19,2,chevrolet impala,gas,std,two,convertible,fwd,front,88.4,...,2bbl,2.91,3.03,9.5,48,5100,47,53,5151.0,TESLA
72,73,3,buick skylark,gas,std,two,convertible,rwd,front,96.6,...,mpfi,3.46,3.1,8.3,155,4750,16,18,35056.0,convertible
128,129,3,porsche boxter,gas,std,two,convertible,rwd,rear,89.5,...,mpfi,3.74,2.9,9.5,207,5900,17,25,37028.0,convertible
172,173,2,toyota cressida,gas,std,two,convertible,rwd,front,98.4,...,mpfi,3.62,3.5,9.3,116,4800,24,30,17669.0,convertible
189,190,3,vw dasher,gas,std,two,convertible,fwd,front,94.5,...,mpfi,3.19,3.4,8.5,90,5500,24,29,11595.0,convertible


In [57]:
car_data_null['CarName'].fillna(car_data['CarName'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  car_data_null['CarName'].fillna(car_data['CarName'].mode()[0], inplace=True)


In [58]:
car_data['CarName'].mode()[0]

'peugeot 504'

In [59]:
car_data_null[car_data_null['CarName'] == car_data['CarName'].mode()[0]]

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,carbody_filled
10,11,2,peugeot 504,gas,std,two,sedan,rwd,front,101.2,...,mpfi,3.5,2.8,8.8,101,5800,23,29,16430.0,sedan
20,21,0,peugeot 504,gas,std,four,sedan,fwd,front,94.5,...,2bbl,3.03,3.11,9.6,70,5400,38,43,6575.0,sedan
107,108,0,peugeot 504,gas,std,four,sedan,rwd,front,107.9,...,mpfi,3.46,3.19,8.4,97,5000,19,24,11900.0,sedan
110,111,0,peugeot 504,diesel,turbo,four,wagon,rwd,front,114.2,...,idi,3.7,3.52,21.0,95,4150,25,25,13860.0,wagon
111,112,0,peugeot 504,gas,std,four,sedan,rwd,front,107.9,...,mpfi,3.46,2.19,8.4,95,5000,19,24,15580.0,sedan
113,114,0,peugeot 504,gas,std,four,wagon,rwd,front,114.2,...,mpfi,3.46,2.19,8.4,95,5000,19,24,16695.0,wagon
115,116,0,peugeot 504,gas,std,four,sedan,rwd,front,107.9,...,mpfi,3.46,3.19,8.4,97,5000,19,24,16630.0,sedan
116,117,0,peugeot 504,diesel,turbo,four,sedan,rwd,front,107.9,...,idi,3.7,3.52,21.0,95,4150,28,33,17950.0,sedan


In [64]:
# Fetching the row with null values
car_data_null[car_data_null['cylindernumber'].isna()]

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,carbody_filled
7,8,1,audi 5000,gas,std,four,wagon,fwd,front,105.8,...,mpfi,3.19,3.4,8.5,110,5500,19,25,18920.0,wagon
12,13,0,bmw x1,gas,std,two,sedan,rwd,front,101.2,...,mpfi,3.31,3.19,9.0,121,4250,21,28,20970.0,sedan
23,24,1,dodge d200,gas,turbo,two,hatchback,fwd,front,93.7,...,mpfi,3.03,3.39,7.6,102,5500,24,30,7957.0,hatchback


# Normalization
 - For instance: we have pixel values ranging from 0-255
 - We divide each pixel value by 255 i.e. x / 255 which converts the pixel values to the range of 0-1   

In [72]:
from sklearn.preprocessing import LabelEncoder

In [73]:
label_enc = LabelEncoder()

In [74]:
X = ['A', 'B']

In [75]:
X_scaled = label_enc.fit_transform(X)
X_scaled

array([0, 1])

In [69]:
car_data_null['enginetype'].fillna(car_data['enginetype'].mode()[0], inplace=True)

In [76]:
car_data_null['enginetype_enc'] = label_enc.fit_transform(car_data_null['enginetype'])

In [77]:
car_data_null[['enginetype', 'enginetype_enc']]

Unnamed: 0,enginetype,enginetype_enc
0,dohc,0
1,dohc,0
2,ohcv,5
3,ohc,3
4,ohc,3
...,...,...
200,ohc,3
201,ohc,3
202,ohcv,5
203,ohc,3


In [81]:
car_data_null['fueltype_enc'] = label_enc.fit_transform(car_data_null['fueltype'])

In [86]:
car_data_null[['fueltype', 'fueltype_enc']].tail(10)

Unnamed: 0,fueltype,fueltype_enc
195,gas,1
196,gas,1
197,gas,1
198,gas,1
199,gas,1
200,gas,1
201,gas,1
202,gas,1
203,diesel,0
204,gas,1
