In [None]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, LassoCV

from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import AdaBoostRegressor

from sklearn.feature_extraction.text import CountVectorizer

from google.colab import drive, files

In [None]:
# Mount Google Drive at /content/drive so files under MyDrive are readable.
# NOTE(review): the dataset is later read through the relative path
# 'drive/MyDrive/...', which presumably relies on the working directory
# being /content - confirm.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the cleaned training data from the capstone project folder on Drive.
train_csv_path = 'drive/MyDrive/capstone_project/train_cleaned.csv'
train = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows to sanity-check the loaded columns.
train.head()

Unnamed: 0,job_description,job_desig,key_skills,location,min_experience,max_experience,average_salary
0,exp minimum 5 year good understanding of ioc r...,Senior Exploit and Vulnerability Researcher,"team skills, communication skills, analytical ...",Delhi NCR(Vikas Puri),5,7,11.0
1,he should have handled a team of atleast 5 6 d...,Head SCM,"ppc, logistics, inventory management, supply c...",Sonepat,10,17,17.5
2,must be an effective communicator written spok...,Deputy Manager - Talent Management & Leadershi...,"HR Analytics, Employee Engagement, Training, S...",Delhi NCR,5,9,27.5
3,7 10 year of overall experience in data engine...,Associate Manager Data Engineering,"SQL, Javascript, Automation, Python, Ruby, Ana...",Bengaluru,7,10,17.5
4,chartered accountancy degree or mba in finance...,TS- GSA- Senior Analyst,"accounting, finance, cash flow, financial plan...",Gurgaon,1,3,6.0


In [None]:
# Build one text document per job posting for CountVectorizer.
# The fields are joined with a single space so that the last word of one field
# and the first word of the next remain separate tokens; plain `+` glued them
# together into one bogus token at every field boundary.
# na_rep='' keeps a row usable when one of the fields is missing; with `+`
# a single NaN turned the whole document into NaN.
train['cv_data'] = train['job_description'].str.cat(
    train[['job_desig', 'key_skills', 'location']],
    sep=' ',
    na_rep='',
)

In [None]:
# Model input is the combined text column; the target is average_salary.
X, y = train['cv_data'], train['average_salary']

In [None]:
# Relative frequency of each distinct average_salary value
# (normalize=True returns fractions instead of raw counts).
y.value_counts(normalize=True)

17.5    0.227261
27.5    0.208323
11.0    0.178375
1.5     0.163881
6.0     0.142619
50.0    0.079541
Name: average_salary, dtype: float64

In [None]:
# Hold out 33% of the rows for evaluation. Stratifying on y keeps the share of
# each salary value the same in both partitions; the fixed seed makes the
# split repeatable.
split_options = {'test_size': 0.33, 'stratify': y, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Baseline: bag-of-words counts -> scaling -> ordinary least squares.
linear_steps = [
    # Turn each text document into a sparse vector of token counts.
    ('cv', CountVectorizer()),
    # with_mean=False scales without centering, as required for sparse input.
    ('ss', StandardScaler(with_mean=False)),
    ('lin', LinearRegression()),
]
pipe = Pipeline(steps=linear_steps)

In [None]:
# Fit the linear baseline, then show its (train, test) scores as a tuple.
# For a regressor pipeline, score() is the coefficient of determination R^2.
pipe.fit(X_train, y_train)
tuple(
    pipe.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

(0.925134376626968, -1.2022971613412703)

In [None]:
# Same text features as the baseline, with a k-nearest-neighbours regressor
# (default hyperparameters) as the final step.
knn_steps = [
    # Turn each text document into a sparse vector of token counts.
    ('cv', CountVectorizer()),
    # with_mean=False scales without centering, as required for sparse input.
    ('ss', StandardScaler(with_mean=False)),
    ('knn', KNeighborsRegressor()),
]
pipe2 = Pipeline(steps=knn_steps)

In [None]:
# Fit the KNN pipeline, then show its (train, test) R^2 scores as a tuple.
pipe2.fit(X_train, y_train)
tuple(
    pipe2.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

(0.49519721015876095, 0.22020185867511566)

In [None]:
# pipe3 = Pipeline([
#     ('cv', CountVectorizer()), 
#     ('lassoCV', LassoCV())
# ])

In [None]:
# pipe3.fit(X_train, y_train)
# pipe3.score(X_train, y_train), pipe3.score(X_test, y_test)

In [None]:
# Bag-of-words counts -> scaling -> random forest regressor.
pipe4 = Pipeline([
    # Turn each text document into a sparse vector of token counts.
    ('cv', CountVectorizer()),
    # with_mean=False scales without centering, as required for sparse input.
    ('ss', StandardScaler(with_mean=False)),
    # Forest construction is randomized; seed it (same seed as the
    # train/test split) so the reported scores are reproducible on re-run.
    ('rfr', RandomForestRegressor(random_state=42))
])

In [None]:
# Fit the random-forest pipeline, then show its (train, test) R^2 scores.
pipe4.fit(X_train, y_train)
tuple(
    pipe4.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

(0.8711679757892901, 0.43489613037297015)

In [None]:
# Bag-of-words counts -> scaling -> AdaBoost regressor.
pipe5 = Pipeline([
    # Turn each text document into a sparse vector of token counts.
    ('cv', CountVectorizer()),
    # with_mean=False scales without centering, as required for sparse input.
    ('ss', StandardScaler(with_mean=False)),
    # Boosting is randomized; seed it (same seed as the train/test split)
    # so the reported scores are reproducible on re-run.
    ('abr', AdaBoostRegressor(random_state=42))
])

In [None]:
# Fit the AdaBoost pipeline, then show its (train, test) R^2 scores.
pipe5.fit(X_train, y_train)
tuple(
    pipe5.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

(0.07643019795096173, 0.05837231027415368)

In [None]:
# Gather train/test scores of every fitted pipeline into a column-oriented
# dict. The three lists are aligned by position: pipe, pipe2, pipe4, pipe5.
# LassoCV (pipe3) is left out because its cells are commented out.
results = {
    'Model': [
        'LinearRegression',
        'KNeighborsRegressor',
        'RandomForestRegressor',
        'AdaBoostRegressor',
    ],
    'train_score': [
        fitted.score(X_train, y_train)
        for fitted in (pipe, pipe2, pipe4, pipe5)
    ],
    'test_score': [
        fitted.score(X_test, y_test)
        for fitted in (pipe, pipe2, pipe4, pipe5)
    ],
}

In [None]:
# One row per model; the dict keys become the columns.
results_df = pd.DataFrame.from_dict(results)

In [None]:
# Display the per-model train/test score comparison table.
results_df

Unnamed: 0,Model,train_score,test_score
0,LinearRegression,0.925134,-1.202297
1,KNeighborsRegressor,0.495197,0.220202
2,RandomForestRegressor,0.871168,0.434896
3,AdaBoostRegressor,0.07643,0.058372
