### Perform various data preprocessing techniques like handling missing data and feature scaling.

#### step 1: Start by importing the necessary Python libraries for data preprocessing.


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#### Step 2: Load the placement dataset into a Pandas Dataframe.

In [7]:
data = pd.read_csv("data.csv",index_col=0)
df = pd.DataFrame(data)
df.head()

Unnamed: 0_level_0,gender,hsc_p,hsc_s,degree_p,degree_t,etest_p,specialisation,mba_p,salary
sl_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,M,91.0,Commerce,58.0,Sci&Tech,55.0,Mkt&HR,58.8,270000.0
2,M,78.33,Science,77.48,Sci&Tech,86.5,Mkt&Fin,66.28,200000.0
3,M,,Arts,64.0,Comm&Mgmt,75.0,Mkt&Fin,57.8,250000.0
4,M,52.0,Science,,Sci&Tech,66.0,Mkt&HR,59.43,
5,M,73.6,Commerce,73.3,Comm&Mgmt,96.8,Mkt&Fin,55.5,425000.0


#### Step 3:Take a quick look at the data to understand its structure and identify any missing values or anomalies.

In [8]:
print(df)
print()
df.info()
print()
df.shape

      gender  hsc_p     hsc_s  degree_p   degree_t  etest_p specialisation  \
sl_no                                                                        
1          M  91.00  Commerce     58.00   Sci&Tech     55.0         Mkt&HR   
2          M  78.33   Science     77.48   Sci&Tech     86.5        Mkt&Fin   
3          M    NaN      Arts     64.00  Comm&Mgmt     75.0        Mkt&Fin   
4          M  52.00   Science       NaN   Sci&Tech     66.0         Mkt&HR   
5          M  73.60  Commerce     73.30  Comm&Mgmt     96.8        Mkt&Fin   
...      ...    ...       ...       ...        ...      ...            ...   
211        M  82.00  Commerce     77.60  Comm&Mgmt     91.0        Mkt&Fin   
212        M  60.00   Science     72.00   Sci&Tech     74.0        Mkt&Fin   
213        M  67.00  Commerce     73.00  Comm&Mgmt     59.0        Mkt&Fin   
214        F  66.00  Commerce     58.00  Comm&Mgmt     70.0         Mkt&HR   
215        M  58.00   Science     53.00  Comm&Mgmt     89.0     

(215, 9)

#### The method isnull() checks each element in the DataFrame (or Series) to see if it is NaN (Not a Number) or None (missing value).
It returns a DataFrame (or Series) of the same shape as the input, with Boolean values:
#### True: The value is null (NaN or None).
#### False: The value is not null.

In [9]:
print(df.isnull())
df.isnull().sum()

       gender  hsc_p  hsc_s  degree_p  degree_t  etest_p  specialisation  \
sl_no                                                                      
1       False  False  False     False     False    False           False   
2       False  False  False     False     False    False           False   
3       False   True  False     False     False    False           False   
4       False  False  False      True     False    False           False   
5       False  False  False     False     False    False           False   
...       ...    ...    ...       ...       ...      ...             ...   
211     False  False  False     False     False    False           False   
212     False  False  False     False     False    False           False   
213     False  False  False     False     False    False           False   
214     False  False  False     False     False    False           False   
215     False  False  False     False     False    False           False   

       mba_

gender             0
hsc_p              5
hsc_s              0
degree_p           2
degree_t           0
etest_p            4
specialisation     0
mba_p              1
salary            67
dtype: int64

#### Step 4: Handle Missing Data
#### Option 1: If the dataset is large and only a small percentage of data is missing, you can remove rows with missing values using dropna(subset,inplace)


In [10]:
print(df.dropna(subset=["salary"],inplace=False))
df.dropna(subset=["salary"],inplace=True)

      gender  hsc_p     hsc_s  degree_p   degree_t  etest_p specialisation  \
sl_no                                                                        
1          M  91.00  Commerce     58.00   Sci&Tech     55.0         Mkt&HR   
2          M  78.33   Science     77.48   Sci&Tech     86.5        Mkt&Fin   
3          M    NaN      Arts     64.00  Comm&Mgmt     75.0        Mkt&Fin   
5          M  73.60  Commerce     73.30  Comm&Mgmt     96.8        Mkt&Fin   
8          M  64.00   Science     66.00   Sci&Tech     67.0        Mkt&Fin   
...      ...    ...       ...       ...        ...      ...            ...   
210        M  72.00  Commerce     65.00  Comm&Mgmt     67.0        Mkt&Fin   
211        M  82.00  Commerce     77.60  Comm&Mgmt     91.0        Mkt&Fin   
212        M  60.00   Science     72.00   Sci&Tech     74.0        Mkt&Fin   
213        M  67.00  Commerce     73.00  Comm&Mgmt     59.0        Mkt&Fin   
214        F  66.00  Commerce     58.00  Comm&Mgmt     70.0     

In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148 entries, 1 to 214
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   gender          148 non-null    object 
 1   hsc_p           146 non-null    float64
 2   hsc_s           148 non-null    object 
 3   degree_p        147 non-null    float64
 4   degree_t        148 non-null    object 
 5   etest_p         146 non-null    float64
 6   specialisation  148 non-null    object 
 7   mba_p           148 non-null    float64
 8   salary          148 non-null    float64
dtypes: float64(5), object(4)
memory usage: 11.6+ KB


#### Option 2:If removing data isn't ideal, you can impute (df.[""].fillna(df[""].mean(),inplace)) missing values using methods like mean, median, or most frequent.

In [11]:
df["hsc_p"].fillna(df["hsc_p"].mean(), inplace=True)
df["etest_p"].fillna(df["etest_p"].mean(), inplace=True)
df["degree_p"].fillna(df["degree_p"].mean(), inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148 entries, 1 to 214
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   gender          148 non-null    object 
 1   hsc_p           148 non-null    float64
 2   hsc_s           148 non-null    object 
 3   degree_p        148 non-null    float64
 4   degree_t        148 non-null    object 
 5   etest_p         148 non-null    float64
 6   specialisation  148 non-null    object 
 7   mba_p           148 non-null    float64
 8   salary          148 non-null    float64
dtypes: float64(5), object(4)
memory usage: 11.6+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["hsc_p"].fillna(df["hsc_p"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["etest_p"].fillna(df["etest_p"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

#### Step 5: Feature Scaling


<img src="https://i.postimg.cc/G21gMYnF/f.png" alt="Image Description" width="500">









 Option 1: This method scales the data to have a mean of 0 and a standard deviation of 1.
### StandardScaler()

In [12]:
c=["hsc_p","degree_p","etest_p","salary"]
sc1=StandardScaler()
df[c]=sc1.fit_transform(df[c])
df.head

<bound method NDFrame.head of       gender         hsc_p     hsc_s  degree_p   degree_t   etest_p  \
sl_no                                                                 
1          M  2.265997e+00  Commerce -1.652293   Sci&Tech -1.328518   
2          M  8.987875e-01   Science  1.346845   Sci&Tech  0.978332   
3          M  1.533482e-15      Arts -0.728534  Comm&Mgmt  0.136149   
5          M  3.883770e-01  Commerce  0.703293  Comm&Mgmt  1.732635   
8          M -6.475513e-01   Science -0.420614   Sci&Tech -0.449718   
...      ...           ...       ...       ...        ...       ...   
210        M  2.157223e-01  Commerce -0.574574  Comm&Mgmt -0.449718   
211        M  1.294814e+00  Commerce  1.365320  Comm&Mgmt  1.307882   
212        M -1.079188e+00   Science  0.503145   Sci&Tech  0.062915   
213        M -3.238237e-01  Commerce  0.657105  Comm&Mgmt -1.035584   
214        F -4.317329e-01  Commerce -1.652293  Comm&Mgmt -0.230018   

      specialisation  mba_p    salary  
sl_no 

#### Option 2:This method scales the data to a fixed range, usually between 0 and 1. 
###  MinMaxScaler()

In [13]:
c=["hsc_p","degree_p","etest_p","salary"]
sc1=MinMaxScaler()
df[c]=sc1.fit_transform(df[c])
df.head

<bound method NDFrame.head of       gender     hsc_p     hsc_s  degree_p   degree_t   etest_p  \
sl_no                                                             
1          M  0.857051  Commerce  0.057143   Sci&Tech  0.104167   
2          M  0.586729   Science  0.613714   Sci&Tech  0.760417   
3          M  0.409023      Arts  0.228571  Comm&Mgmt  0.520833   
5          M  0.485812  Commerce  0.494286  Comm&Mgmt  0.975000   
8          M  0.280990   Science  0.285714   Sci&Tech  0.354167   
...      ...       ...       ...       ...        ...       ...   
210        M  0.451675  Commerce  0.257143  Comm&Mgmt  0.354167   
211        M  0.665031  Commerce  0.617143  Comm&Mgmt  0.854167   
212        M  0.195648   Science  0.457143   Sci&Tech  0.500000   
213        M  0.344997  Commerce  0.485714  Comm&Mgmt  0.187500   
214        F  0.323661  Commerce  0.057143  Comm&Mgmt  0.416667   

      specialisation  mba_p    salary  
sl_no                                  
1             Mkt&

####  Step 6:Separate the dataset into features (X) and target (y) variables. The target is usually the column you want to predict.

In [16]:
x=df[["gender","hsc_p","degree_p","etest_p","specialisation","mba_p"]]
y=df[["salary"]]
y.head()

pandas.core.frame.DataFrame


### Step 7: After preprocessing, save the cleaned and scaled dataset to a new CSV file


In [17]:
final = pd.concat([x,y],axis=1)
final.to_csv("cleaned_data.csv",index=False)

In [12]:
# Lab-1 Activities

#Perform data preprocesing for Automobile.csv

#i. Delete the column horsepower since it has few missing values

#ii. Impute missing with meadin

#iii. Apply min-max scaling and standardization on the Automobiles.csv and provide the reasoning which feature scaling method make more sense to this dataset.