# Data Science Process
1. Define the Problem
2. Data Collection
3. EDA (Exploratory Data Analysis)
4. Data Cleaning and Pre-Processing
5. Feature Engineering
6. Modelling
7. Evaluation
8. Deployment

# Exploratory Data Analysis 
1. Load and Understand the structure of the data
2. Clean the data
3. Get the statistical summaries
4. Data Visualization

<b>Pandas is used for EDA.</b>

<b>Matplotlib and Seaborn are used for Data Visualization.</b>

<b>Pandas</b>: Data analysis and manipulation tool.

In [7]:
import pandas as pd

In [8]:
# pandas offers two core containers:
#   * Series    -> one-dimensional labelled array
#   * DataFrame -> two-dimensional labelled array (rows x columns, like a table)

# Column-oriented input: every key maps to the list of values for that column.
info = dict(
    Name=["Gojo", "Toji", "Adi"],
    Marks=[95, 78, 99],
)

df = pd.DataFrame(data=info)
print(df)

# The dictionary keys become the column headers of the DataFrame.
# 0, 1 and 2 are the automatically generated row labels (the index).

   Name  Marks
0  Gojo     95
1  Toji     78
2   Adi     99


In [9]:
# A bare expression on the last line of a cell is rendered by the notebook as a table (no print needed).
df

Unnamed: 0,Name,Marks
0,Gojo,95
1,Toji,78
2,Adi,99


In [10]:
# Series - 1D labelled array
s = pd.Series([1, 2, 3, 4, 5])
print(s)
print(type(s))

# Positional access: .iloc selects by integer position, whatever the labels are.
# (Plain s[0] is a *label* lookup; it only appears positional here because the
# default labels happen to be 0..4.)
print(s.iloc[0])
print(s.iloc[2])

# Position and label are not the same thing: the index holds the labels,
# and those labels can be customised.

0    1
1    2
2    3
3    4
4    5
dtype: int64
<class 'pandas.core.series.Series'>
1
3


In [11]:
# Custom labels: the mapping's keys become the index, its values become the data
s = pd.Series({"Adi": 22, "Gojo": 23, "Toji": 24, "Nanami": 25, "Sukuna": 26})
print(s)

# Look values up by their label instead of by position
print(s["Adi"], s["Gojo"], sep="\n")

# The index object holds every label of the Series
print(s.index)

Adi       22
Gojo      23
Toji      24
Nanami    25
Sukuna    26
dtype: int64
22
23
Index(['Adi', 'Gojo', 'Toji', 'Nanami', 'Sukuna'], dtype='object')


# Properties of Series:
- Homogeneous (all values share a single dtype)
- Vectorized Operations
- Handle Missing values with NaN
- Mutable values, immutable size

In [15]:
# Two equally sized Series that share the default 0..4 labels
s1 = pd.Series(data=[1, 2, 3, 4, 5])
s2 = pd.Series(data=[10, 20, 30, 40, 50])

# Vectorized operation: values are added element-wise, matched up by label
print(s1.add(s2))

# Values are mutable: overwrite the element labelled 0 in place
s1.loc[0] = 100
# drop() does not shrink s1 itself; it returns a new Series without label 0
changed_s1 = s1.drop(labels=0)

print(s1)
print(changed_s1)

0    11
1    22
2    33
3    44
4    55
dtype: int64
0    100
1      2
2      3
3      4
4      5
dtype: int64
1    2
2    3
3    4
4    5
dtype: int64


In [17]:
# DataFrame (2D labelled array) built from a dict of columns

info = dict(
    Name=["Adam", "Eve", "Bob"],
    Age=[23, 24, 25],
    GPA=[9.5, 8.6, 7.2],
)

df = pd.DataFrame(data=info)
print(df)
print(df.index)  # row labels; none were supplied, so pandas generated them

   Name  Age  GPA
0  Adam   23  9.5
1   Eve   24  8.6
2   Bob   25  7.2
RangeIndex(start=0, stop=3, step=1)


In [22]:
# Row-oriented input: each inner list is one row; column names are passed separately
df = pd.DataFrame(
    data=[
        ["Adam", 23],
        ["Bob", 23],
        ["Eve", 23],
    ],
    columns=["Name", "Age"],
)
print(df)

   Name  Age
0  Adam   23
1   Bob   23
2   Eve   23


In [23]:
# Building a DataFrame from a 2-D NumPy array

import numpy as np

np_arr = np.array(
    [
        [1, 2, 3],
        [4, 5, 6],
    ]
)

# Each array row becomes a DataFrame row; the column labels are given explicitly
df = pd.DataFrame(data=np_arr, columns=list("ABC"))
print(df)

   A  B  C
0  1  2  3
1  4  5  6


In [24]:
# Reading a CSV file straight into a DataFrame

df = pd.read_csv(filepath_or_buffer="employee_data.csv")
# Show the loaded table followed by the type of the object read_csv returned
print(df, type(df))

   ID     Name  Age Department  Salary
0   1    Alice   25         HR   55000
1   2      Bob   32         IT   72000
2   3  Charlie   28    Finance   48000
3   4    David   45  Marketing   91000
4   5      Eva   38         IT   65000
5   6    Frank   29    Finance   50000
6   7    Grace   41         HR   82000
7   8   Hannah   26  Marketing   47000
8   9      Ian   35         IT   75000
9  10    Julia   30    Finance   60000 <class 'pandas.core.frame.DataFrame'>


In [26]:
# Reading a JSON file straight into a DataFrame

df = pd.read_json(path_or_buf="employee_data.json")
# Show the loaded table followed by the type of the object read_json returned
print(df, type(df))

   ID     Name  Age Department  Salary
0   1    Alice   25         HR   55000
1   2      Bob   32         IT   72000
2   3  Charlie   28    Finance   48000
3   4    David   45  Marketing   91000
4   5      Eva   38         IT   65000
5   6    Frank   29    Finance   50000
6   7    Grace   41         HR   82000
7   8   Hannah   26  Marketing   47000
8   9      Ian   35         IT   75000
9  10    Julia   30    Finance   60000 <class 'pandas.core.frame.DataFrame'>


# Pandas (DataFrame Methods and Attributes)
- df.head()
- df.tail()
- df.sample()
- df.info()
- df.shape
- df.describe()
- df.columns
- df.nunique()

In [27]:
# Load the employee records from JSON again so the cells below start from this data
df = pd.read_json(path_or_buf="employee_data.json")
print(df, type(df))

   ID     Name  Age Department  Salary
0   1    Alice   25         HR   55000
1   2      Bob   32         IT   72000
2   3  Charlie   28    Finance   48000
3   4    David   45  Marketing   91000
4   5      Eva   38         IT   65000
5   6    Frank   29    Finance   50000
6   7    Grace   41         HR   82000
7   8   Hannah   26  Marketing   47000
8   9      Ian   35         IT   75000
9  10    Julia   30    Finance   60000 <class 'pandas.core.frame.DataFrame'>


In [43]:
# Methods
# NOTE: a notebook renders only the value of the LAST bare expression in a cell.
# The earlier expressions below are evaluated but their results are not shown;
# df.info() is the exception because it prints directly to stdout.

df.head() # First 5 rows (default n=5)
df.tail() # Last 5 rows (default n=5)

df.sample() # One randomly chosen row (default n=1)

df.info() # Prints the index range, column dtypes, non-null counts and memory usage

df.shape # Tuple of (number of rows, number of columns)

df.describe() # Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

df.columns # Index holding all the column labels

# Number of distinct values in each column (NOT the unique rows).
# Kept as the final expression so that its result is what the cell displays.
# Expected result:
#   ID            10
#   Name          10
#   Age           10
#   Department     4
#   Salary        10
#   dtype: int64
df.nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          10 non-null     int64 
 1   Name        10 non-null     object
 2   Age         10 non-null     int64 
 3   Department  10 non-null     object
 4   Salary      10 non-null     int64 
dtypes: int64(3), object(2)
memory usage: 532.0+ bytes


ID            10
Name          10
Age           10
Department     4
Salary        10
dtype: int64

In [32]:
# Display the full employee DataFrame (all 10 rows) as the cell's output.
df

Unnamed: 0,ID,Name,Age,Department,Salary
0,1,Alice,25,HR,55000
1,2,Bob,32,IT,72000
2,3,Charlie,28,Finance,48000
3,4,David,45,Marketing,91000
4,5,Eva,38,IT,65000
5,6,Frank,29,Finance,50000
6,7,Grace,41,HR,82000
7,8,Hannah,26,Marketing,47000
8,9,Ian,35,IT,75000
9,10,Julia,30,Finance,60000
