# *************** Salary Prediction Model ***************

#  Import Requied Libraries

In [109]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

%matplotlib inline

# Reads the dataset form CSV file into pandas Dataframe
- Assign the vaule of features of house from the dataset to X and y

In [110]:
saladata = pd.read_csv("Salary Prediction.csv")

X = saladata[['Age',	'Gender','Education Level','Job Title','Years of Experience']]
Y = saladata['Salary']
print(X)
print(Y)
saladata.head()

     Age  Gender Education Level                      Job Title   
0     32    Male      Bachelor's              Software Engineer  \
1     28  Female        Master's                   Data Analyst   
2     45    Male             PhD                 Senior Manager   
3     36  Female      Bachelor's                Sales Associate   
4     52    Male        Master's                       Director   
..   ...     ...             ...                            ...   
368   35  Female      Bachelor's       Senior Marketing Analyst   
369   43    Male        Master's         Director of Operations   
370   29  Female      Bachelor's         Junior Project Manager   
371   34    Male      Bachelor's  Senior Operations Coordinator   
372   44  Female             PhD        Senior Business Analyst   

     Years of Experience  
0                    5.0  
1                    3.0  
2                   15.0  
3                    7.0  
4                   20.0  
..                   ...  
368   

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32,Male,Bachelor's,Software Engineer,5.0,90000
1,28,Female,Master's,Data Analyst,3.0,65000
2,45,Male,PhD,Senior Manager,15.0,150000
3,36,Female,Bachelor's,Sales Associate,7.0,60000
4,52,Male,Master's,Director,20.0,200000


# Train-Test Split
- Splits the dataset into train and test sets.


In [111]:
X_train, X_test ,Y_train ,Y_test = train_test_split(X,Y,test_size=0.20,random_state=0)

# Preprocessing Pipeline
- Creat a preprocessing pipeline for numerical features (scaling)
- categorical features (one-hot encoding).

In [112]:
nu_features = ['Age','Years of Experience']
ca_features = ['Gender','Education Level','Job Title']

In [113]:
nu_transform = Pipeline(steps=[('scaler', StandardScaler())])
ca_transform = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [114]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',nu_transform,nu_features),
        ('cat',ca_transform,ca_features)
    ]
)

# Linear Regression Model
- Creates a pipeline that applies the preprocessing steps and then fits a linear regression model

In [115]:
model = Pipeline(steps=[('preprocessor',preprocessor),
                        ('regressor',LinearRegression())])

# Fit and Train the Model 
- Fits the model on the training X_train and Y_train that splits early data

In [116]:
model.fit(X_train,Y_train)

# Predictions and Model Evaluation
- Predicts the target variable on the test set
- Evaluates the model using the R-squared score on next line

In [117]:
y_pred = model.predict(X_test)

In [118]:
print('Coefficients :',model.named_steps['regressor'].coef_)

Coefficients : [ 1.50212945e+04  1.21881616e+04 -7.85019343e+02  7.85019343e+02
 -4.79251043e+03 -1.74434103e+03  6.53685146e+03 -1.00696408e+03
 -1.54450451e+04 -4.36101934e+04 -6.85358222e+03  9.81778146e+01
  9.84302896e+04  8.66160961e+04  8.46813399e+04 -1.14532005e+04
 -1.82654715e+04 -1.97572889e+04 -2.72998188e+04 -2.18943599e+04
 -1.41953695e+04 -3.15569571e+04 -1.52035250e+04  6.06555598e+03
  5.03817741e+04  3.99909738e+04  3.10671287e+04  3.65082881e+04
  2.52432581e+04  3.70824256e+04  2.97591450e+04  3.20752842e+04
  4.21960445e+04  2.42351026e+04  3.96686112e+03 -1.92729509e+04
  2.68729693e+04 -2.73198929e+04 -8.62956450e+03 -1.67735998e+04
 -1.18927875e+04 -1.61685018e+04 -2.82647956e+04 -1.70975078e+04
 -1.98181500e+04 -1.87731028e+04 -2.18879724e+04 -4.54294305e+04
 -2.14532007e+04 -2.26202180e+04 -2.82647956e+04 -1.31709963e+04
 -2.24613562e+04 -1.14913022e+04 -9.15219073e+03 -1.96135801e+04
 -2.04450452e+04 -2.06660051e+04 -1.49612897e+04 -1.89117464e+04
 -1.244125

# R2 Score
- prints the r2 score based on prediction and actual data 

In [119]:
r2 = r2_score(Y_test , y_pred)
print("R-squared score :" ,r2)

R-squared score : 0.8896576278656085


# Test the model on new data

In [120]:
new_data = pd.DataFrame({
    'Age' : [55,22],
    'Gender' : ['Male' , 'Male'],
    'Education Level': ["PhD", "Master's"],
    'Job Title': ['Senior Data Scientist','Senior Data Scientist'],
    'Years of Experience' : [1,50]
})

In [121]:
new_salary_prediction = model.predict(new_data)

In [122]:
print('Prediction of salary on new data')
print(new_salary_prediction)

Prediction of salary on new data
[153266.05056318 165344.71535717]
