# Pandas in python

Pandas is used in data science for cleaning the data, analysing data, manipulating the data,etc

In [1]:
import pandas as pd
pd.__version__

'2.0.3'

# DataFrame

A Pandas DataFrame is a 2 dimensional data structure, like a 2 dimensional array, or a table with rows and columns.

In [2]:
dataset = {
    'car' : ['Baleno', 'Swift', 'Harrier', 'Punch', 'Fortuner'],
    'YoM' : [2017, 2015, 2020, 2022, 2018],
    'Top Speed' : [150, 200, 200, 250, 250]
}
myvar = pd.DataFrame(dataset, index=['Car1', 'Car2', 'Car3', 'Car4', 'Car5'])
pd.DataFrame(dataset)

Unnamed: 0,car,YoM,Top Speed
0,Baleno,2017,150
1,Swift,2015,200
2,Harrier,2020,200
3,Punch,2022,250
4,Fortuner,2018,250


In [3]:
print(myvar.loc['Car2'])

car          Swift
YoM           2015
Top Speed      200
Name: Car2, dtype: object


In [4]:
print(myvar.loc[['Car1', 'Car3']])

          car   YoM  Top Speed
Car1   Baleno  2017        150
Car3  Harrier  2020        200


# Series

A Pandas Series is like a column in a table.
It is a one-dimensional array holding data of any type.

In [5]:
a = [9, 8, 7, 6, 5]
pd.Series(a, index=['a', 'b','c', 'd', 'e'])

a    9
b    8
c    7
d    6
e    5
dtype: int64

In [6]:
days = {1 : 'Sunday', 2 : 'Monday', 3 : 'Tuesday', 4 : 'Wednesday', 5 : 'Thursday', 6 : 'Friday', 7 : 'Saturday'}
pd.Series(days)

1       Sunday
2       Monday
3      Tuesday
4    Wednesday
5     Thursday
6       Friday
7     Saturday
dtype: object

In [7]:
pd.Series(days, index=[2, 3, 4, 5, 6])

2       Monday
3      Tuesday
4    Wednesday
5     Thursday
6       Friday
dtype: object

# Pandas Read CSV

A simple way to store big data sets is to use CSV files (comma separated files).

In [8]:

df = pd.read_csv('/home/developer/Desktop/data.csv')
# If you have a large DataFrame with many rows, Pandas will only return the first 5 rows, and the last 5 rows
print(df)

   Sno.          Name   Age   Salary       DoB
0     1  Ashish Yadav  22.0  15000.0  26/04/02
1     2          Anil  23.0  10000.0  20/02/02
2     3         Aswan  21.0  20000.0  27/05/02
3     4       Harshit   NaN  15000.0       NaN
4     5       Sandeep  25.0      NaN  09/02/96
5     6          Ajay  26.0      NaN  28/02/98
6     7       Bhawesh  23.0  20000.0    110811
7     8           NaN  30.0  80000.0  19981205


In [9]:
print(df.to_string())    # to_string() is used to print the entire DataFrame

   Sno.          Name   Age   Salary       DoB
0     1  Ashish Yadav  22.0  15000.0  26/04/02
1     2          Anil  23.0  10000.0  20/02/02
2     3         Aswan  21.0  20000.0  27/05/02
3     4       Harshit   NaN  15000.0       NaN
4     5       Sandeep  25.0      NaN  09/02/96
5     6          Ajay  26.0      NaN  28/02/98
6     7       Bhawesh  23.0  20000.0    110811
7     8           NaN  30.0  80000.0  19981205


The number of rows returned is defined in Pandas option settings.
You can check your system's maximum rows with the pd.options.display.max_rows statement.

In [10]:
print(pd.options.display.max_rows)
pd.options.display.max_rows = 9999

60


# Pandas Read JSON
Big data sets are often stored, or extracted as JSON.
JSON is plain text, but has the format of an object, and is well known in the world of programming, including Pandas.
JSON is equal to the Python Dictionary
JSON objects have the same format as Python dictionaries.

In [11]:
# data = pd.read_json("file.json")
# print(data.to_string())

In [12]:
json_df = {
  "Duration":{"0":60, "1":60, "2":60, "3":45, "4":45, "5":60},
  "Pulse":{"0":110, "1":117, "2":103, "3":109, "4":117, "5":102},
  "Maxpulse":{"0":130, "1":145, "2":135, "3":175, "4":148, "5":127},
  "Calories":{"0":409, "1":479, "2":340, "3":282, "4":406, "5":300}
}
jsondf = pd.DataFrame(json_df)
print(jsondf)

   Duration  Pulse  Maxpulse  Calories
0        60    110       130       409
1        60    117       145       479
2        60    103       135       340
3        45    109       175       282
4        45    117       148       406
5        60    102       127       300


In [13]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Sno.    8 non-null      int64  
 1   Name    7 non-null      object 
 2   Age     7 non-null      float64
 3   Salary  6 non-null      float64
 4   DoB     7 non-null      object 
dtypes: float64(2), int64(1), object(2)
memory usage: 448.0+ bytes
None


# Data Cleaning
Data cleaning means fixing bad data in your data set.
Bad data could be:
Empty cells,
Data in wrong format,
Wrong data,
Duplicates,etc.

In [14]:
#print(df)
new_df = df.dropna()
print(new_df)
# df['Name'].fillna('?', inplace=True)  # inplace=True modifies the original data file
print(df)

   Sno.          Name   Age   Salary       DoB
0     1  Ashish Yadav  22.0  15000.0  26/04/02
1     2          Anil  23.0  10000.0  20/02/02
2     3         Aswan  21.0  20000.0  27/05/02
6     7       Bhawesh  23.0  20000.0    110811
   Sno.          Name   Age   Salary       DoB
0     1  Ashish Yadav  22.0  15000.0  26/04/02
1     2          Anil  23.0  10000.0  20/02/02
2     3         Aswan  21.0  20000.0  27/05/02
3     4       Harshit   NaN  15000.0       NaN
4     5       Sandeep  25.0      NaN  09/02/96
5     6          Ajay  26.0      NaN  28/02/98
6     7       Bhawesh  23.0  20000.0    110811
7     8           NaN  30.0  80000.0  19981205


In [15]:
age_avg = df['Age'].median()
df['Age'].fillna(age_avg, inplace=True)
salary_avg = df['Salary'].mean()
df['Salary'].fillna(salary_avg, inplace=True)
df

Unnamed: 0,Sno.,Name,Age,Salary,DoB
0,1,Ashish Yadav,22.0,15000.0,26/04/02
1,2,Anil,23.0,10000.0,20/02/02
2,3,Aswan,21.0,20000.0,27/05/02
3,4,Harshit,23.0,15000.0,
4,5,Sandeep,25.0,26666.666667,09/02/96
5,6,Ajay,26.0,26666.666667,28/02/98
6,7,Bhawesh,23.0,20000.0,110811
7,8,,30.0,80000.0,19981205


# Pandas - Cleaning Data of Wrong Format
Cells with data of wrong format can make it difficult, or even impossible, to analyze data.
To fix it, you have two options: remove the rows, or convert all cells in the columns into the same format.

In [16]:
df['DoB'] = pd.to_datetime(df['DoB'])
print(df)
df.dropna(subset=['DoB'], inplace=True)
# print(df)

   Sno.          Name   Age        Salary        DoB
0     1  Ashish Yadav  22.0  15000.000000 2002-04-26
1     2          Anil  23.0  10000.000000 2002-02-20
2     3         Aswan  21.0  20000.000000 2002-05-27
3     4       Harshit  23.0  15000.000000        NaT
4     5       Sandeep  25.0  26666.666667 1996-09-02
5     6          Ajay  26.0  26666.666667 1998-02-28
6     7       Bhawesh  23.0  20000.000000 2011-11-08
7     8           NaN  30.0  80000.000000 1998-12-05


  df['DoB'] = pd.to_datetime(df['DoB'])


In [17]:
data = pd.read_csv('data.csv')
y = data['Date'].mode()
data['Date'].fillna(y[0], inplace=True)
data['Date'] = pd.to_datetime(data['Date'],format='mixed')
print(data)

    Duration       Date  Pulse  Maxpulse  Calories
0         60 2020-12-01    110       130     409.1
1         60 2020-12-02    117       145     479.0
2         60 2020-12-03    103       135     340.0
3         45 2020-12-04    109       175     282.4
4         45 2020-12-05    117       148     406.0
5         60 2020-12-06    102       127     300.0
6         60 2020-12-07    110       136     374.0
7        450 2020-12-08    104       134     253.3
8         30 2020-12-09    109       133     195.1
9         60 2020-12-10     98       124     269.0
10        60 2020-12-11    103       147     329.3
11        60 2020-12-12    100       120     250.7
12        60 2020-12-12    100       120     250.7
13        60 2020-12-13    106       128     345.3
14        60 2020-12-14    104       132     379.3
15        60 2020-12-15     98       123     275.0
16        60 2020-12-16     98       120     215.2
17        60 2020-12-17    100       120     300.0
18        45 2020-12-18     90 

# Pandas - Cleaning Wrong data
"Wrong data" does not have to be "empty cells" or "wrong format", it can just be wrong, like if someone registered "199" instead of "1.99".
Sometimes you can spot wrong data by looking at the data set, because you have an expectation of what it should be.

In [18]:
data.loc[7, 'Duration'] = 45
data.loc[22, 'Date'] = y[0]

In [19]:
for x in data.index:
    if data.loc[x, 'Maxpulse'] > 150:
        data.loc[x, 'Maxpulse'] = 150
# data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Duration  32 non-null     int64         
 1   Date      32 non-null     datetime64[ns]
 2   Pulse     32 non-null     int64         
 3   Maxpulse  32 non-null     int64         
 4   Calories  30 non-null     float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 1.4 KB


In [20]:
for x in data.index:
    if data.loc[x, 'Pulse'] < 100:
        data.drop(x, inplace=True)