# Importing the libraries

In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Problem Statement

Given a dataset describing the attributes of engineering graduates together with the salary each of them was offered, build a model that predicts an engineering graduate's salary from the provided features.

# Importing the dataset

Link: https://drive.google.com/file/d/13fgjGI3uGlnEqefpZ72JsWLqXqveNKo1/view

In [None]:
# Path to the salary dataset CSV (a /content/drive path, so presumably a
# Google Drive mount in Colab - adjust when running elsewhere).
DATA_PATH = '/content/drive/MyDrive/ShapeAI DST 11021 Oct-Jan Batch 2021-22/Datasets/Engineering_graduate_salary.csv'
df = pd.read_csv(DATA_PATH)

# EDA - Exploratory Data Analysis

In [None]:
df.head()

Unnamed: 0,ID,Gender,DOB,10percentage,10board,12graduation,12percentage,12board,CollegeID,CollegeTier,...,MechanicalEngg,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience,Salary
0,604399,f,1990-10-22,87.8,cbse,2009,84.0,cbse,6920,1,...,-1,-1,-1,-1,-0.159,0.3789,1.2396,0.1459,0.2889,445000
1,988334,m,1990-05-15,57.0,cbse,2010,64.5,cbse,6624,2,...,-1,-1,-1,-1,1.1336,0.0459,1.2396,0.5262,-0.2859,110000
2,301647,m,1989-08-21,77.33,"maharashtra state board,pune",2007,85.17,amravati divisional board,9084,2,...,-1,-1,260,-1,0.51,-0.1232,1.5428,-0.2902,-0.2875,255000
3,582313,m,1991-05-04,84.3,cbse,2009,86.0,cbse,8195,1,...,-1,-1,-1,-1,-0.4463,0.2124,0.3174,0.2727,0.4805,420000
4,339001,f,1990-10-30,82.0,cbse,2008,75.0,cbse,4889,2,...,-1,-1,-1,-1,-1.4992,-0.7473,-1.0697,0.06223,0.1864,200000


# Data Description
ID: A unique ID to identify a candidate

Salary: Annual CTC offered to the candidate (in INR)

Gender: Candidate's gender

DOB: Date of birth of the candidate

10percentage: Overall marks obtained in grade 10 examinations

10board: The school board whose curriculum the candidate followed in grade 10

12graduation: Year of graduation - senior year high school

12percentage: Overall marks obtained in grade 12 examinations

12board: The school board whose curriculum the candidate followed in grade 12

CollegeID: Unique ID identifying the university/college which the candidate attended for her/his undergraduate

CollegeTier: Each college has been annotated as 1 or 2. The annotations have been computed from the average AMCAT scores obtained by the students in the college/university. Colleges with an average score above a threshold are tagged as 1 and others as 2.

Degree: Degree obtained/pursued by the candidate

Specialization: Specialization pursued by the candidate

collegeGPA: Aggregate GPA at graduation

CollegeCityID: A unique ID to identify the city in which the college is located in.

CollegeCityTier: The tier of the city in which the college is located in. This is annotated based on the population of the cities.

CollegeState: Name of the state in which the college is located

GraduationYear: Year of graduation (Bachelor's degree)

English: Scores in AMCAT English section

Logical: Score in AMCAT Logical ability section

Quant: Score in AMCAT's Quantitative ability section

Domain: Scores in AMCAT's domain module

ComputerProgramming: Score in AMCAT's Computer programming section

ElectronicsAndSemicon: Score in AMCAT's Electronics & Semiconductor Engineering section

ComputerScience: Score in AMCAT's Computer Science section

MechanicalEngg: Score in AMCAT's Mechanical Engineering section

ElectricalEngg: Score in AMCAT's Electrical Engineering section

TelecomEngg: Score in AMCAT's Telecommunication Engineering section

CivilEngg: Score in AMCAT's Civil Engineering section

conscientiousness: Scores in one of the sections of AMCAT's personality test

agreeableness: Scores in one of the sections of AMCAT's personality test

extraversion: Scores in one of the sections of AMCAT's personality test

nueroticism: Scores in one of the sections of AMCAT's personality test

openess_to_experience: Scores in one of the sections of AMCAT's personality test


In [None]:
df.tail()

Unnamed: 0,ID,Gender,DOB,10percentage,10board,12graduation,12percentage,12board,CollegeID,CollegeTier,...,MechanicalEngg,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience,Salary
2993,103174,f,1989-04-17,75.0,0,2005,73.0,0,1263,2,...,-1,-1,-1,-1,-1.1901,0.9688,-1.0697,1.3549,0.0284,120000
2994,352811,f,1991-07-22,84.0,state board,2008,77.0,state board,9481,2,...,-1,-1,-1,-1,-0.1082,0.0328,-0.4891,-0.2902,0.5024,120000
2995,287070,m,1988-11-24,91.4,bsemp,2006,65.56,bsemp,547,2,...,-1,-1,-1,-1,-0.881,0.1888,-0.344,0.0623,0.6603,385000
2996,317336,m,1988-08-25,88.64,karnataka education board,2006,65.16,karnataka education board,1629,2,...,-1,-1,-1,-1,1.4374,1.2808,-0.4891,-1.46537,0.5419,530000
2997,993701,m,1992-05-27,77.0,state board,2009,75.5,state board,1111,2,...,-1,-1,-1,-1,-0.5899,-1.9521,0.3174,1.1601,-2.3937,200000


In [None]:
df.shape

(2998, 34)

In [None]:
df.columns

Index(['ID', 'Gender', 'DOB', '10percentage', '10board', '12graduation',
       '12percentage', '12board', 'CollegeID', 'CollegeTier', 'Degree',
       'Specialization', 'collegeGPA', 'CollegeCityID', 'CollegeCityTier',
       'CollegeState', 'GraduationYear', 'English', 'Logical', 'Quant',
       'Domain', 'ComputerProgramming', 'ElectronicsAndSemicon',
       'ComputerScience', 'MechanicalEngg', 'ElectricalEngg', 'TelecomEngg',
       'CivilEngg', 'conscientiousness', 'agreeableness', 'extraversion',
       'nueroticism', 'openess_to_experience', 'Salary'],
      dtype='object')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2998 entries, 0 to 2997
Data columns (total 34 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     2998 non-null   int64  
 1   Gender                 2998 non-null   object 
 2   DOB                    2998 non-null   object 
 3   10percentage           2998 non-null   float64
 4   10board                2998 non-null   object 
 5   12graduation           2998 non-null   int64  
 6   12percentage           2998 non-null   float64
 7   12board                2998 non-null   object 
 8   CollegeID              2998 non-null   int64  
 9   CollegeTier            2998 non-null   int64  
 10  Degree                 2998 non-null   object 
 11  Specialization         2998 non-null   object 
 12  collegeGPA             2998 non-null   float64
 13  CollegeCityID          2998 non-null   int64  
 14  CollegeCityTier        2998 non-null   int64  
 15  Coll

In [None]:
df.isna().sum()

ID                       0
Gender                   0
DOB                      0
10percentage             0
10board                  0
12graduation             0
12percentage             0
12board                  0
CollegeID                0
CollegeTier              0
Degree                   0
Specialization           0
collegeGPA               0
CollegeCityID            0
CollegeCityTier          0
CollegeState             0
GraduationYear           0
English                  0
Logical                  0
Quant                    0
Domain                   0
ComputerProgramming      0
ElectronicsAndSemicon    0
ComputerScience          0
MechanicalEngg           0
ElectricalEngg           0
TelecomEngg              0
CivilEngg                0
conscientiousness        0
agreeableness            0
extraversion             0
nueroticism              0
openess_to_experience    0
Salary                   0
dtype: int64

In [None]:
df.isna().sum().sum()

0

In [None]:
df.columns

Index(['ID', 'Gender', 'DOB', '10percentage', '10board', '12graduation',
       '12percentage', '12board', 'CollegeID', 'CollegeTier', 'Degree',
       'Specialization', 'collegeGPA', 'CollegeCityID', 'CollegeCityTier',
       'CollegeState', 'GraduationYear', 'English', 'Logical', 'Quant',
       'Domain', 'ComputerProgramming', 'ElectronicsAndSemicon',
       'ComputerScience', 'MechanicalEngg', 'ElectricalEngg', 'TelecomEngg',
       'CivilEngg', 'conscientiousness', 'agreeableness', 'extraversion',
       'nueroticism', 'openess_to_experience', 'Salary'],
      dtype='object')

In [None]:
# Removing the columns - ID, 10board, 12board, CollegeID, CollegeCityID,
# CollegeCityTier, CollegeState

In [None]:
# Columns removed before modelling: the candidate/college identifiers plus
# the school-board and college-location fields.
COLUMNS_TO_DROP = [
    'ID', '10board', '12board', 'CollegeID',
    'CollegeCityID', 'CollegeCityTier', 'CollegeState',
]
# Rebind instead of mutating in place; errors='ignore' makes re-running this
# cell a no-op once the columns are already gone (the in-place version raised
# KeyError on a second run).
df = df.drop(columns=COLUMNS_TO_DROP, errors='ignore')

In [None]:
# 34 columns = 33 features + 1 target; dropping the 7 distinct columns leaves 26 features + 1 target

In [None]:
df.head(1)

Unnamed: 0,Gender,DOB,10percentage,12graduation,12percentage,CollegeTier,Degree,Specialization,collegeGPA,GraduationYear,...,MechanicalEngg,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience,Salary
0,f,1990-10-22,87.8,2009,84.0,1,B.Tech/B.E.,instrumentation and control engineering,73.82,2013,...,-1,-1,-1,-1,-0.159,0.3789,1.2396,0.1459,0.2889,445000


In [None]:
df.Gender.value_counts()

m    2282
f     716
Name: Gender, dtype: int64

In [None]:
enc = LabelEncoder()

In [None]:
# Replace the Gender labels with the integer codes produced by the encoder.
df['Gender'] = enc.fit_transform(df['Gender'])

In [None]:
df.Gender.value_counts()

1    2282
0     716
Name: Gender, dtype: int64

m -> 1

f -> 0

In [None]:
df.head(1)

Unnamed: 0,Gender,DOB,10percentage,12graduation,12percentage,CollegeTier,Degree,Specialization,collegeGPA,GraduationYear,...,MechanicalEngg,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience,Salary
0,0,1990-10-22,87.8,2009,84.0,1,B.Tech/B.E.,instrumentation and control engineering,73.82,2013,...,-1,-1,-1,-1,-0.159,0.3789,1.2396,0.1459,0.2889,445000


In [None]:
df['10percentage'].describe()

count    2998.000000
mean       77.666264
std        10.002785
min        43.000000
25%        71.140000
50%        78.965000
75%        85.600000
max        97.760000
Name: 10percentage, dtype: float64

In [None]:
df['12graduation'].describe()

count    2998.000000
mean     2008.080720
std         1.631814
min      1998.000000
25%      2007.000000
50%      2008.000000
75%      2009.000000
max      2012.000000
Name: 12graduation, dtype: float64

In [None]:
df['12percentage'].describe()

count    2998.000000
mean       74.341061
std        11.120299
min        40.000000
25%        66.000000
50%        74.000000
75%        82.600000
max        98.700000
Name: 12percentage, dtype: float64

In [None]:
# Age -> 2018 - x -> 2018 - 1990

In [None]:
# GraduationYear

In [None]:
df['GraduationYear'].describe()

count    2998.000000
mean     2011.939960
std        36.780582
min         0.000000
25%      2012.000000
50%      2013.000000
75%      2014.000000
max      2017.000000
Name: GraduationYear, dtype: float64

In [None]:
newdf = df.copy()

In [None]:
newdf['DOB'] = pd.to_datetime(newdf['DOB'])

In [None]:
# 01-12-2015

In [None]:
newdf.head(1)

Unnamed: 0,Gender,DOB,10percentage,12graduation,12percentage,CollegeTier,Degree,Specialization,collegeGPA,GraduationYear,...,MechanicalEngg,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience,Salary
0,0,1990-10-22,87.8,2009,84.0,1,B.Tech/B.E.,instrumentation and control engineering,73.82,2013,...,-1,-1,-1,-1,-0.159,0.3789,1.2396,0.1459,0.2889,445000


In [None]:
newdf.DOB[0].year

1990

In [None]:
newdf.DOB.max().year

1997

In [None]:
newdf.DOB.min().year

1981

In [None]:
df['DOB'] = pd.to_datetime(df['DOB'])

In [None]:
df.head()

Unnamed: 0,Gender,DOB,10percentage,12graduation,12percentage,CollegeTier,Degree,Specialization,collegeGPA,GraduationYear,...,MechanicalEngg,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience,Salary
0,0,1990-10-22,87.8,2009,84.0,1,B.Tech/B.E.,instrumentation and control engineering,73.82,2013,...,-1,-1,-1,-1,-0.159,0.3789,1.2396,0.1459,0.2889,445000
1,1,1990-05-15,57.0,2010,64.5,2,B.Tech/B.E.,computer science & engineering,65.0,2014,...,-1,-1,-1,-1,1.1336,0.0459,1.2396,0.5262,-0.2859,110000
2,1,1989-08-21,77.33,2007,85.17,2,B.Tech/B.E.,electronics & telecommunications,61.94,2011,...,-1,-1,260,-1,0.51,-0.1232,1.5428,-0.2902,-0.2875,255000
3,1,1991-05-04,84.3,2009,86.0,1,B.Tech/B.E.,computer science & engineering,80.4,2013,...,-1,-1,-1,-1,-0.4463,0.2124,0.3174,0.2727,0.4805,420000
4,0,1990-10-30,82.0,2008,75.0,2,B.Tech/B.E.,biotechnology,64.3,2012,...,-1,-1,-1,-1,-1.4992,-0.7473,-1.0697,0.06223,0.1864,200000


In [None]:
# Preview the derived age (reference year 2018 minus birth year) for the first
# few rows only; printing one line per row floods the notebook output.
# Requires DOB to have been converted with pd.to_datetime already.
(2018 - df['DOB'].dt.year).head()

28
28
29
27
28
29
29
27
26
25
28
26
27
26
27
28
30
26
27
27
31
26
32
30
27
28
26
30
28
29
26
26
28
28
27
27
27
26
27
30
28
27
32
27
30
27
27
34
28
28
28
28
28
25
25
28
27
28
27
31
26
26
26
26
27
29
27
28
27
25
27
28
27
26
29
25
26
32
26
29
27
29
28
26
29
27
27
26
28
28
26
28
28
31
26
28
25
35
27
27
28
27
25
27
26
31
26
29
28
25
29
30
31
28
26
30
28
33
27
29
26
27
26
26
30
30
27
25
30
25
29
28
31
25
27
28
27
28
28
28
31
28
28
29
25
27
25
28
26
28
26
28
28
26
30
29
28
28
28
26
29
29
26
26
26
26
26
29
30
25
28
29
31
32
27
27
27
26
28
28
27
25
30
25
26
28
26
30
27
28
27
29
27
28
29
27
29
31
26
27
31
26
25
30
25
26
27
25
30
25
27
25
28
26
26
27
30
27
26
26
27
27
29
27
26
25
25
30
30
26
30
27
28
33
26
27
26
26
30
28
28
30
26
26
27
26
29
27
26
27
28
29
30
27
27
25
32
29
28
25
27
26
31
26
28
30
26
25
27
28
31
27
28
26
26
27
25
25
26
30
27
29
28
25
28
27
28
30
26
26
27
26
28
27
26
25
29
25
29
28
28
30
29
25
27
27
26
29
27
27
23
27
26
26
26
27
28
25
30
27
29
27
27
27
26
30
27
28
31
27
27
26
29
2

In [None]:
age = []

In [None]:
# Age relative to the reference year 2018, computed from the birth year in a
# single vectorised step. Unlike appending to a list created in another cell,
# this yields the same column no matter how many times the cell is re-run.
df['Age'] = (2018 - df['DOB'].dt.year).astype('int64')

In [None]:
df.head()

Unnamed: 0,Gender,DOB,10percentage,12graduation,12percentage,CollegeTier,Degree,Specialization,collegeGPA,GraduationYear,...,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience,Salary,Age
0,0,1990-10-22,87.8,2009,84.0,1,B.Tech/B.E.,instrumentation and control engineering,73.82,2013,...,-1,-1,-1,-0.159,0.3789,1.2396,0.1459,0.2889,445000,28
1,1,1990-05-15,57.0,2010,64.5,2,B.Tech/B.E.,computer science & engineering,65.0,2014,...,-1,-1,-1,1.1336,0.0459,1.2396,0.5262,-0.2859,110000,28
2,1,1989-08-21,77.33,2007,85.17,2,B.Tech/B.E.,electronics & telecommunications,61.94,2011,...,-1,260,-1,0.51,-0.1232,1.5428,-0.2902,-0.2875,255000,29
3,1,1991-05-04,84.3,2009,86.0,1,B.Tech/B.E.,computer science & engineering,80.4,2013,...,-1,-1,-1,-0.4463,0.2124,0.3174,0.2727,0.4805,420000,27
4,0,1990-10-30,82.0,2008,75.0,2,B.Tech/B.E.,biotechnology,64.3,2012,...,-1,-1,-1,-1.4992,-0.7473,-1.0697,0.06223,0.1864,200000,28


In [None]:
# DOB is no longer needed once Age has been derived from it. Rebinding with
# errors='ignore' keeps the cell re-runnable after the column is gone.
df = df.drop(columns='DOB', errors='ignore')

In [None]:
df.head()

Unnamed: 0,Gender,10percentage,12graduation,12percentage,CollegeTier,Degree,Specialization,collegeGPA,GraduationYear,English,...,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience,Salary,Age
0,0,87.8,2009,84.0,1,B.Tech/B.E.,instrumentation and control engineering,73.82,2013,650,...,-1,-1,-1,-0.159,0.3789,1.2396,0.1459,0.2889,445000,28
1,1,57.0,2010,64.5,2,B.Tech/B.E.,computer science & engineering,65.0,2014,440,...,-1,-1,-1,1.1336,0.0459,1.2396,0.5262,-0.2859,110000,28
2,1,77.33,2007,85.17,2,B.Tech/B.E.,electronics & telecommunications,61.94,2011,485,...,-1,260,-1,0.51,-0.1232,1.5428,-0.2902,-0.2875,255000,29
3,1,84.3,2009,86.0,1,B.Tech/B.E.,computer science & engineering,80.4,2013,675,...,-1,-1,-1,-0.4463,0.2124,0.3174,0.2727,0.4805,420000,27
4,0,82.0,2008,75.0,2,B.Tech/B.E.,biotechnology,64.3,2012,575,...,-1,-1,-1,-1.4992,-0.7473,-1.0697,0.06223,0.1864,200000,28


In [None]:
df.CollegeTier.unique()

array([1, 2])

In [None]:
df.CollegeTier.value_counts()

2    2772
1     226
Name: CollegeTier, dtype: int64

In [None]:
df.shape[0]

2998

In [None]:
2772/2998

0.9246164109406271

In [None]:
# Planned split:
# 80% - Train
# 20% - Test

In [None]:
df.GraduationYear.describe()

count    2998.000000
mean     2011.939960
std        36.780582
min         0.000000
25%      2012.000000
50%      2013.000000
75%      2014.000000
max      2017.000000
Name: GraduationYear, dtype: float64

In [None]:
df.head()

Unnamed: 0,Gender,10percentage,12graduation,12percentage,CollegeTier,Degree,Specialization,collegeGPA,GraduationYear,English,...,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience,Salary,Age
0,0,87.8,2009,84.0,1,B.Tech/B.E.,instrumentation and control engineering,73.82,2013,650,...,-1,-1,-1,-0.159,0.3789,1.2396,0.1459,0.2889,445000,28
1,1,57.0,2010,64.5,2,B.Tech/B.E.,computer science & engineering,65.0,2014,440,...,-1,-1,-1,1.1336,0.0459,1.2396,0.5262,-0.2859,110000,28
2,1,77.33,2007,85.17,2,B.Tech/B.E.,electronics & telecommunications,61.94,2011,485,...,-1,260,-1,0.51,-0.1232,1.5428,-0.2902,-0.2875,255000,29
3,1,84.3,2009,86.0,1,B.Tech/B.E.,computer science & engineering,80.4,2013,675,...,-1,-1,-1,-0.4463,0.2124,0.3174,0.2727,0.4805,420000,27
4,0,82.0,2008,75.0,2,B.Tech/B.E.,biotechnology,64.3,2012,575,...,-1,-1,-1,-1.4992,-0.7473,-1.0697,0.06223,0.1864,200000,28


In [None]:
df.Degree.unique()

array(['B.Tech/B.E.', 'M.Tech./M.E.', 'MCA', 'M.Sc. (Tech.)'],
      dtype=object)

In [None]:
df.Degree.value_counts()

B.Tech/B.E.      2757
MCA               200
M.Tech./M.E.       40
M.Sc. (Tech.)       1
Name: Degree, dtype: int64

In [None]:
# TODO: group Degree into two categories:
#   B.Tech/B.E. -> 'Bachelors'
#   MCA, M.Tech./M.E., M.Sc. (Tech.) -> 'Masters'

In [None]:
df.Specialization.value_counts()

electronics and communication engineering      670
computer science & engineering                 557
information technology                         506
computer engineering                           415
computer application                           201
mechanical engineering                         155
electronics and electrical engineering         148
electronics & telecommunications                89
electrical engineering                          63
electronics & instrumentation eng               24
instrumentation and control engineering         18
information science engineering                 18
electronics and instrumentation engineering     18
civil engineering                               15
electronics engineering                         13
biotechnology                                   12
other                                           10
industrial & production engineering              8
chemical engineering                             7
applied electronics and instrum

In [None]:
df.head()

Unnamed: 0,Gender,10percentage,12graduation,12percentage,CollegeTier,Degree,Specialization,collegeGPA,GraduationYear,English,...,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience,Salary,Age
0,0,87.8,2009,84.0,1,B.Tech/B.E.,instrumentation and control engineering,73.82,2013,650,...,-1,-1,-1,-0.159,0.3789,1.2396,0.1459,0.2889,445000,28
1,1,57.0,2010,64.5,2,B.Tech/B.E.,computer science & engineering,65.0,2014,440,...,-1,-1,-1,1.1336,0.0459,1.2396,0.5262,-0.2859,110000,28
2,1,77.33,2007,85.17,2,B.Tech/B.E.,electronics & telecommunications,61.94,2011,485,...,-1,260,-1,0.51,-0.1232,1.5428,-0.2902,-0.2875,255000,29
3,1,84.3,2009,86.0,1,B.Tech/B.E.,computer science & engineering,80.4,2013,675,...,-1,-1,-1,-0.4463,0.2124,0.3174,0.2727,0.4805,420000,27
4,0,82.0,2008,75.0,2,B.Tech/B.E.,biotechnology,64.3,2012,575,...,-1,-1,-1,-1.4992,-0.7473,-1.0697,0.06223,0.1864,200000,28


In [None]:
df.collegeGPA.describe()

count    2998.000000
mean       71.509857
std         8.122462
min         6.630000
25%        66.530000
50%        71.800000
75%        76.300000
max        99.930000
Name: collegeGPA, dtype: float64

In [None]:
df.GraduationYear.describe()

count    2998.000000
mean     2011.939960
std        36.780582
min         0.000000
25%      2012.000000
50%      2013.000000
75%      2014.000000
max      2017.000000
Name: GraduationYear, dtype: float64

In [None]:
df.columns

Index(['Gender', '10percentage', '12graduation', '12percentage', 'CollegeTier',
       'Degree', 'Specialization', 'collegeGPA', 'GraduationYear', 'English',
       'Logical', 'Quant', 'Domain', 'ComputerProgramming',
       'ElectronicsAndSemicon', 'ComputerScience', 'MechanicalEngg',
       'ElectricalEngg', 'TelecomEngg', 'CivilEngg', 'conscientiousness',
       'agreeableness', 'extraversion', 'nueroticism', 'openess_to_experience',
       'Salary', 'Age'],
      dtype='object')

In [None]:
df.English.describe()

count    2998.000000
mean      501.066378
std       105.304512
min       180.000000
25%       425.000000
50%       500.000000
75%       570.000000
max       875.000000
Name: English, dtype: float64

In [None]:
df.Logical.describe()

count    2998.000000
mean      500.431621
std        87.299850
min       195.000000
25%       441.250000
50%       505.000000
75%       565.000000
max       795.000000
Name: Logical, dtype: float64

In [None]:
df.Quant.describe()

count    2998.000000
mean      514.137759
std       122.194955
min       120.000000
25%       430.000000
50%       515.000000
75%       595.000000
max       900.000000
Name: Quant, dtype: float64

In [None]:
df.Domain.describe()

count    2998.000000
mean        0.508458
std         0.463373
min        -1.000000
25%         0.342315
50%         0.622643
75%         0.835612
max         0.999910
Name: Domain, dtype: float64

In [None]:
# Domain: observed values range from -1 to 1

In [None]:
df.columns

Index(['Gender', '10percentage', '12graduation', '12percentage', 'CollegeTier',
       'Degree', 'Specialization', 'collegeGPA', 'GraduationYear', 'English',
       'Logical', 'Quant', 'Domain', 'ComputerProgramming',
       'ElectronicsAndSemicon', 'ComputerScience', 'MechanicalEngg',
       'ElectricalEngg', 'TelecomEngg', 'CivilEngg', 'conscientiousness',
       'agreeableness', 'extraversion', 'nueroticism', 'openess_to_experience',
       'Salary', 'Age'],
      dtype='object')

In [None]:
# Subject-specific AMCAT score columns:
# 'ComputerProgramming', 'ElectronicsAndSemicon', 'ComputerScience', 'MechanicalEngg',
# 'ElectricalEngg', 'TelecomEngg', 'CivilEngg'

In [None]:
# Scratch note (sketch of two rows of subject-specific scores):
# CS    E CP ME EL TE C
# 900 -1 -1 -1 -1 -1 -1
# -1 900 900 -1 -1 -1 -1

In [None]:
# Scratch note (continuation of the sketch):
# DOMAIN
# 900
# 900

In [None]:
df['Domain']

In [None]:
# SVR is sensitive to the scale of both the inputs and the target. The
# features are standardised inside a pipeline, and the target (Salary) is
# standardised by TransformedTargetRegressor, which also maps predictions back
# to the original units. The resulting object still exposes fit()/predict().
model = TransformedTargetRegressor(
    regressor=make_pipeline(StandardScaler(), SVR()),
    transformer=StandardScaler(),
)

In [None]:
# Features: every column except the target. `columns=` is required here -
# a bare df.drop('Salary') searches the row labels and raises KeyError.
# get_dummies one-hot encodes the remaining text columns (e.g. Degree,
# Specialization) so the estimator only receives numeric input.
X = pd.get_dummies(df.drop(columns='Salary'), dtype=int)
y = df['Salary']

In [None]:
# Hold out 20% of the rows for testing; a fixed random_state keeps the split
# (and therefore the reported score) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
r2_score(y_test, y_pred)

In [None]:
# Example prediction: pass a single row with the same columns the model was
# trained on (here the first held-out row). The double brackets keep the
# input two-dimensional, as predict() expects.
model.predict(X_test.iloc[[0]])