In [None]:
#Importing sklearn packages, creating custom tokenizer, text preprocessor and salary prep

In [None]:
#Note: the feature set is limited - only region (referred to as location in the exercise), job_category and country are used as initial features

#Using summary variable with textual analysis to create additional features based on the Tfidf values

import nltk
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score,train_test_split,GridSearchCV

#Column names handed to SalaryPreprocessor(columns_to_drop=...) below.
#Presumably these are removed before one-hot encoding - confirm against SalaryPreprocessor.
columns_to_drop = [
    'job_title',
    'company',
    'location',
    'salary',
    'link',
    'summary',
    'exch_rate',
    'pay_rate',
    'lower',
    'upper',
    'median_salary',
]

#Cap on the number of TF-IDF terms kept from the job summary (TfidfVectorizer max_features)
tfidfmax_features = 10

#Custom tokenizer used by the TF-IDF vectoriser
def Tokenizer(str_input):
    """Split ``str_input`` into lower-cased tokens.

    Any character that is not an ASCII letter, a digit or a hyphen acts as a
    separator, so hyphenated terms stay together as one token.

    Parameters
    ----------
    str_input : str
        Raw text to tokenise.

    Returns
    -------
    list of str
        The tokens in order of appearance; empty list when there are none.
    """
    # Splitting on every disallowed character leaves empty pieces between
    # adjacent separators; those are filtered out.
    pieces = re.split(r"[^A-Za-z0-9\-]", str_input)
    return [piece.lower() for piece in pieces if piece]

#English stop-word list from NLTK, passed to the TF-IDF vectoriser below
stopwordz = stopwords.words('english')


#Text branch of the feature union: TextSelector('summary') feeds a TF-IDF vectoriser
#(1- to 3-grams, at most tfidfmax_features terms)
text_preprocessor = (
    'text',
    Pipeline([
        ('colext', TextSelector('summary')),
        ('tfidf', TfidfVectorizer(
            tokenizer=Tokenizer,
            stop_words=list(set(stopwordz)),
            min_df=.0025,
            max_df=0.9,
            max_features=tfidfmax_features,
            ngram_range=(1,3),
        )),
    ]),
)

#Categorical branch: SalaryPreprocessor (given columns_to_drop) followed by a dense one-hot
#encoding; categories not seen during fit are ignored at transform time
salary_preprocessor = (
    'nominal',
    Pipeline([
        ('salary_prep', SalaryPreprocessor(columns_to_drop=columns_to_drop)),
        ('encoder', OneHotEncoder(categories="auto", sparse=False, handle_unknown='ignore')),
    ]),
)


In [None]:
#Grid-search two pipelines on the training data and print each one's best result
def model_compare(classifier1,classifier2,params1,params2):
    """Run a 5-fold grid search for two pipelines and print a short report.

    Each pipeline is searched over its own parameter grid on the notebook-level
    ``X_train`` / ``y_train``. For every pipeline the best mean cross-validated
    score and the corresponding parameter set are printed.

    Parameters
    ----------
    classifier1, classifier2 : sklearn Pipeline
        Pipelines whose final estimator is registered under the step name 'clf'.
    params1, params2 : dict
        Parameter grids for ``classifier1`` and ``classifier2`` respectively.

    Returns
    -------
    None
    """
    for clf, param in zip([classifier1, classifier2], [params1, params2]):
        grid_cv = GridSearchCV(clf, param_grid=param, cv=5, verbose=2, n_jobs=-1)
        grid_cv.fit(X_train, y_train)

        # Take the class name directly from the 'clf' step instead of parsing
        # str(cls) with a regular expression, which raised AttributeError for
        # any class name the pattern did not happen to match.
        estimator_name = type(grid_cv.estimator.named_steps['clf']).__name__

        print()
        print(f"{estimator_name} cross-validated score:  {grid_cv.best_score_:.3f}")
        print(grid_cv.best_params_)

In [None]:
#Conducting analysis with one feature 'region'; dropping all other columns
#columns_to_drop = ['job_title', 'company', 'location', 'salary',
       #'link', 'summary', 'country', 'exch_rate', 'pay_rate', 'lower', 'upper',
       #'median_salary']

#tfidfmax_features = 1

In [None]:
#Logistic regression on the combined text + categorical features.
#StandardScaler(with_mean=False) rescales without centring.
classifier1 = Pipeline([
    ('features', FeatureUnion([text_preprocessor, salary_preprocessor])),
    ('scaler', StandardScaler(with_mean=False)),
    ('clf', LogisticRegression()),
])

#Grid for classifier1: 50 log-spaced values of C, with/without intercept, L1 vs L2 penalty
params1 = {
    'clf__C': np.logspace(-4, 4),
    'clf__class_weight': [None],
    'clf__fit_intercept': [True, False],
    'clf__multi_class': ['auto'],
    'clf__penalty': ['l2', 'l1'],
    'clf__random_state': [0],
    'clf__solver': ['liblinear'],
    'clf__verbose': [2],
}

In [None]:
#Decision tree on the same combined text + categorical features as classifier1
classifier2 = Pipeline([
    ('features', FeatureUnion([text_preprocessor, salary_preprocessor])),
    ('scaler', StandardScaler(with_mean=False)),
    ('clf', DecisionTreeClassifier()),
])

#Grid for classifier2: split criterion, depth 2-19, and feature / leaf-node limits
#(2-5 or unlimited)
params2 = {
    'clf__class_weight': [None],
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': list(range(2, 20)),
    'clf__max_features': list(range(2, 6)) + [None],
    'clf__max_leaf_nodes': list(range(2, 6)) + [None],
    'clf__random_state': [0],
}

In [None]:
#Grid-search both pipelines defined above. model_compare (defined earlier in the notebook)
#runs the 5-fold search for each and prints the best CV score and parameters, so the
#loop does not need to be repeated here.
model_compare(classifier1,classifier2,params1,params2)

In [None]:
#Text-only experiment: both pipelines use just the TF-IDF branch

#classifier1 - logistic regression (the grid below overrides the solver with liblinear)
classifier1 = Pipeline([
    ('features', FeatureUnion([text_preprocessor])),
    ('clf', LogisticRegression(solver='lbfgs')),
])

#Grid for classifier1: 10 log-spaced values of C between 0.1 and 1, L1 vs L2 penalty
params1 = {
    'clf__C': np.logspace(-1, 0, 10),
    'clf__class_weight': [None],
    'clf__fit_intercept': [True],
    'clf__multi_class': ['auto'],
    'clf__penalty': ['l2', 'l1'],
    'clf__random_state': [0],
    'clf__solver': ['liblinear'],
    'clf__verbose': [2],
}


#classifier2 - decision tree
classifier2 = Pipeline([
    ('features', FeatureUnion([text_preprocessor])),
    ('clf', DecisionTreeClassifier()),
])

#Grid for classifier2: shallow trees only (depth 2-3), 5 leaf nodes or unlimited
params2 = {
    'clf__class_weight': [None],
    'clf__criterion': ['gini'],
    'clf__max_depth': list(range(2, 4)),
    'clf__max_features': [None],
    'clf__max_leaf_nodes': list(range(5, 6)) + [None],
    'clf__random_state': [0],
}

In [None]:
#Drop list that additionally removes 'country', 'job_category' and 'level'.
#NOTE(review): rebinding these names does not alter text_preprocessor / salary_preprocessor,
#which were constructed earlier with the values in effect at that time.
columns_to_drop = [
    'job_title',
    'company',
    'location',
    'salary',
    'link',
    'summary',
    'country',
    'exch_rate',
    'pay_rate',
    'lower',
    'upper',
    'median_salary',
    'job_category',
    'level',
]

#Cap on the number of TF-IDF terms
tfidfmax_features = 10

In [None]:
#Grid-search the current classifier1 / classifier2. model_compare (defined earlier in the
#notebook) runs the 5-fold search for each and prints the best CV score and parameters,
#replacing the copy-pasted loop.
model_compare(classifier1,classifier2,params1,params2)

In [None]:
#Text + categorical experiment: both pipelines combine the TF-IDF branch with the one-hot branch

#classifier1 - logistic regression (the grid below overrides the solver with liblinear)
classifier1 = Pipeline([
    ('features', FeatureUnion([text_preprocessor, salary_preprocessor])),
    ('clf', LogisticRegression(solver='lbfgs')),
])

#Grid for classifier1: 50 log-spaced values of C between 0.01 and 10, L1 vs L2 penalty
params1 = {
    'clf__C': np.logspace(-2, 1),
    'clf__class_weight': [None],
    'clf__fit_intercept': [True],
    'clf__multi_class': ['auto'],
    'clf__penalty': ['l2', 'l1'],
    'clf__random_state': [0],
    'clf__solver': ['liblinear'],
    'clf__verbose': [2],
}


#classifier2 - decision tree
classifier2 = Pipeline([
    ('features', FeatureUnion([text_preprocessor, salary_preprocessor])),
    ('clf', DecisionTreeClassifier()),
])

#Grid for classifier2: depth 2-19, 5-9 leaf nodes or unlimited
params2 = {
    'clf__class_weight': [None],
    'clf__criterion': ['gini'],
    'clf__max_depth': list(range(2, 20)),
    'clf__max_features': [None],
    'clf__max_leaf_nodes': list(range(5, 10)) + [None],
    'clf__random_state': [0],
}

In [None]:
#Drop list and TF-IDF vocabulary cap read by pipelines constructed after this cell.
#NOTE(review): pipelines built before this cell keep the values they were constructed with.
columns_to_drop = [
    'job_title',
    'company',
    'location',
    'salary',
    'link',
    'summary',
    'exch_rate',
    'pay_rate',
    'lower',
    'upper',
    'median_salary',
]

tfidfmax_features = 100

In [None]:
#Grid-search the current classifier1 / classifier2 with their grids.
#NOTE(review): both pipelines were constructed before the preceding cell rebound
#columns_to_drop / tfidfmax_features, so they still carry the earlier values.
model_compare(classifier1,classifier2,params1,params2)

In [None]:
#Dimensions of X_test
X_test.shape

In [None]:
#Baseline: logistic regression on a TF-IDF representation of the summary limited to
#at most two terms (max_features=2).
#The features are built once and reused for fitting and scoring, instead of re-fitting
#a second, identical vectoriser just to score.
summary_tfidf = TfidfVectorizer(tokenizer=Tokenizer, stop_words=list(set(stopwordz)),
                                min_df=.0025, max_df=0.9, max_features=2, ngram_range=(1,3))
train_text_features = summary_tfidf.fit_transform(TextSelector('summary').fit_transform(X_train)).toarray()

lgr = LogisticRegression(C=100)
lgr.fit(train_text_features, y_train)

#Accuracy on the training data itself (not a held-out estimate)
lgr.score(train_text_features, y_train)

In [None]:
#Inspect what TextSelector('summary') produces for the training data
TextSelector('summary').fit_transform(X_train)

In [None]:









#Final model: TF-IDF features of the summary plus one-hot encoded categorical columns,
#combined in a FeatureUnion and fed to a logistic regression.
#Built from the current values of tfidfmax_features and columns_to_drop.
summary_branch = Pipeline([
    ('colext', TextSelector('summary')),
    ('tfidf', TfidfVectorizer(
        tokenizer=Tokenizer,
        stop_words=list(set(stopwordz)),
        min_df=.0025,
        max_df=0.9,
        max_features=tfidfmax_features,
        ngram_range=(1,3),
    )),
])

nominal_branch = Pipeline([
    ('salary_prep', SalaryPreprocessor(columns_to_drop=columns_to_drop)),
    ('encoder', OneHotEncoder(categories="auto", sparse=False, handle_unknown='ignore')),
])

classifier = Pipeline([
    ('features', FeatureUnion([('text', summary_branch), ('nominal', nominal_branch)])),
    ('clf', LogisticRegression(solver='lbfgs')),
])

In [None]:
#Fit the full pipeline (feature extraction + logistic regression) on the training data
classifier.fit(X_train,y_train)

In [None]:
#Accuracy on the data the model was fitted on
classifier.score(X_train,y_train)

In [None]:
#Accuracy on X_test / y_test
classifier.score(X_test,y_test)

In [None]:
#Mean accuracy over 5 cross-validation folds of the training data
cross_val_score(classifier,X_train,y_train,cv=5,n_jobs=-1).mean()

In [None]:
#Predicted labels for X_test, used by the confusion matrix below
y_pred_test = classifier.predict(X_test)

In [None]:
#Confusion matrix with class 1 listed first, then class 0 (order set by labels=[1,0])
confusion_matrix(y_test,y_pred_test,labels=[1,0])

In [None]:
#Term names from the fitted TF-IDF step: pipeline step 0 is the FeatureUnion, its first
#transformer is the 'text' pipeline, whose second step is the vectoriser
tfidf_featurenames = classifier.steps[0][1].get_params()['transformer_list'][0][1].steps[1][1].get_feature_names()


In [None]:
#Feature names from the fitted one-hot encoder: second transformer of the FeatureUnion
#('nominal'), second step of that pipeline
encoder_featurenames = classifier.steps[0][1].get_params()['transformer_list'][1][1].steps[1][1].get_feature_names()

In [None]:
#Text names first, then encoder names - the same order in which the FeatureUnion lists its branches
featurenames = np.concatenate((tfidf_featurenames,encoder_featurenames))

In [None]:
#Coefficients of the final pipeline step (the logistic regression)
coefficients = classifier.steps[1][1].coef_

In [None]:
#One row per feature, holding the fitted coefficient and its absolute value
coef_table = pd.DataFrame(coefficients, index=['coef'], columns=featurenames).transpose()
coef_table = coef_table.assign(coef_abs=coef_table['coef'].abs())

#Largest-magnitude coefficients first
coef_table.sort_values('coef_abs', ascending=False)

In [None]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import make_pipeline,make_union

#Categorical-only baseline: SalaryPreprocessor -> one-hot encoding -> scaling -> logistic regression
columns_to_drop = ['job_title','location','salary','link','exch_rate','pay_rate','lower','upper','median_salary',
                   'company','summary']

columns_to_str=['summary']

salary_prep = SalaryPreprocessor(columns_to_drop=columns_to_drop,
                            columns_to_str=columns_to_str)

#Dense one-hot output; categories unseen during fit are ignored at transform time
encoder = OneHotEncoder(categories = "auto",sparse=False,handle_unknown='ignore')

scaler = StandardScaler()
model = LogisticRegression(solver='liblinear', random_state=0)


pipe = Pipeline(steps = [('sal_prep',salary_prep),('dummy',encoder),('scaler',scaler),('model',model)])

#The name FeatureUnion is deliberately NOT rebound here: assigning make_union() to it
#replaces the class with an instance, and every later FeatureUnion([...]) call then fails.
#TODO: a combined text + categorical pipeline ("fullpipe") was only sketched and is not built here.

In [None]:
#Text (TF-IDF reduced with truncated SVD) plus a scaled numeric column, fed to an XGBoost classifier.
#NOTE(review): stop_words, TruncatedSVD, NumberSelector and XGBClassifier are not defined or
#imported anywhere in the visible notebook, and the column names 'Text' / 'TotalWords' differ
#from the 'summary' column used elsewhere - this looks like a template carried over from
#another project; confirm before running.
#NOTE(review): this rebinds `classifier`, replacing the fitted pipeline used above.
classifier = Pipeline([
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('colext', TextSelector('Text')),
            ('tfidf', TfidfVectorizer(tokenizer=Tokenizer, stop_words=stop_words,
                     min_df=.0025, max_df=0.25, ngram_range=(1,3))),
            ('svd', TruncatedSVD(algorithm='randomized', n_components=300)), #for XGB
        ])),
        ('words', Pipeline([
            ('wordext', NumberSelector('TotalWords')),
            ('wscaler', StandardScaler()),
        ])),
    ])),
    ('clf', XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.1)),
#    ('clf', RandomForestClassifier()),
    ])

In [None]:
#Fit the categorical-only baseline pipeline
pipe.fit(X_train,y_train)

In [None]:
#Accuracy on X_test / y_test
pipe.score(X_test,y_test)

In [None]:
#Mean accuracy over 5 cross-validation folds of the training data
cross_val_score(pipe,X_train,y_train,cv=5).mean()

In [None]:
y_pred_test = pipe.predict(X_test)

#Confusion matrix with class 1 listed first, then class 0 (order set by labels=[1,0])
confusion_matrix(y_test,y_pred_test,labels=[1,0])

In [None]:
#All category values learned by the one-hot encoder, flattened into one array
np.hstack(pipe.get_params()['dummy'].categories_)

In [None]:
#Coefficients of the logistic regression (first row of coef_), labelled with the encoder's category values
pd.DataFrame(pipe.get_params()['model'].coef_[0],index=np.hstack(pipe.get_params()['dummy'].categories_),columns=["coef"])

In [None]:
from sklearn.preprocessing import Binarizer, LabelBinarizer,OrdinalEncoder,StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC


#Model matrix from the categorical columns: dummy-encode each one, dropping its first level
categorical_columns = ['region','level','country','job_category']
df1 = pd.get_dummies(
    df[categorical_columns + ['salary_cat']],
    columns=categorical_columns,
    drop_first=True,
)

#Target: salary_cat encoded to numeric codes and kept as a one-column DataFrame
ordinal = OrdinalEncoder()
salary_cat_frame = df[['salary_cat']]
y = pd.DataFrame(ordinal.fit_transform(salary_cat_frame), columns=salary_cat_frame.columns)

#Features: every column of df1 except the target
X = df1.drop(columns='salary_cat')

In [None]:
#Rescale every feature to unit variance without centring; the scaler is fitted on the
#training data only and then applied to both splits.
#NOTE(review): X_train / X_test must already exist - no split of the X / y built above is
#visible between that cell and this one.
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train)

X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)



In [None]:
#Four candidate models, each as a (pipeline, parameter grid) pair. Every pipeline names its
#final estimator 'clf' so the grids and model_compare can address it uniformly.

#--- grid1: k-nearest neighbours ---
pipeline1 = Pipeline((
    ('clf', KNeighborsClassifier()),
))

parameter1 = {
    'clf__metric': ['minkowski', 'euclidean'],
    'clf__n_jobs': [-1],
    'clf__n_neighbors': range(1, 50),
    'clf__weights': ['uniform', 'distance'],
}

grid1 = (pipeline1, parameter1)

#--- grid2: SalaryPreprocessor -> scaling without centring -> logistic regression ---
pipeline2 = Pipeline((
    ('pre', SalaryPreprocessor(
        columns_to_drop=columns_to_drop,
        columns_to_dummify=columns_to_dummify,
        columns_to_str=columns_to_str,
        columns_for_tfidf=columns_for_tfidf,
    )),
    ('scale', StandardScaler(with_mean=False)),
    ('clf', LogisticRegression()),
))

parameter2 = {
    'clf__verbose': [2],
    'clf__random_state': [0],
    'clf__max_iter': [10000],
    'clf__C': np.logspace(-2, 0, 10),
    'clf__fit_intercept': [True, False],
    'clf__penalty': ['l2', 'l1'],
    'clf__solver': ['liblinear'],
}

grid2 = (pipeline2, parameter2)

#--- grid3: multinomial naive Bayes ---
pipeline3 = Pipeline((
    ('clf', MultinomialNB()),
))

parameter3 = {
    'clf__alpha': np.logspace(-4, -2, 10),
}

grid3 = (pipeline3, parameter3)

#--- grid4: support vector classifier ---
pipeline4 = Pipeline((
    ('clf', SVC()),
))

parameter4 = {
    'clf__C': np.logspace(-2, 0, 20),
    'clf__gamma': np.logspace(-2, 0, 20),
    'clf__kernel': ['rbf', 'poly'],
    'clf__max_iter': [-1],
    'clf__random_state': [0],
    'clf__verbose': [2],
}

grid4 = (pipeline4, parameter4)


In [None]:
#All four (pipeline, parameter grid) pairs
grids = [grid1,grid2,grid3,grid4]

In [None]:
#Check of the class-name extraction used in model_compare: pulls the class name of
#pipeline4's 'clf' step out of the string form of its class
re.search(r"\.([A-Z]+.[A-Za-z]+)",str(pipeline4.get_params()['clf'].__class__)).group(1)

In [None]:
def model_compare(grids):
    
    results_dict = {}
    for i in range(len(grids)):

        grid_s = GridSearchCV(grids[i][0],grids[i][1],verbose=2,n_jobs=-1,cv=5)
        grid_s.fit(X_train,y_train)
        estimator_name = re.search(r"\.([A-Z]+.[A-Za-z]+)",str(grid_s.get_params()['estimator__clf'].__class__)).group(1)
        results_dict[estimator_name] = [grid_s.best_score_]
        results_dict[estimator_name].append(grid_s.best_params_)
    
    return (grid_s,results_dict)

In [None]:
#Restrict the comparison to the logistic-regression grid and run the search
grids=[grid2]
model_grid = model_compare(grids)

In [None]:
#Runs the search again with whatever `grids` currently holds
#grids=[grid2]
model_grid = model_compare(grids)

In [None]:
#Widen column display so the best-parameter dictionaries are not truncated
pd.set_option('display.max_colwidth',5000)
#One row per estimator, with its best CV score and best parameter set
grid_df = pd.DataFrame(model_grid[1],index=['score','best_params']).T

grid_df

In [None]:
#Confusion matrix on X_test / y_test, class 1 listed first (labels=[1,0])
y_pred_test = model_grid[0].predict(X_test)
confusion_matrix(y_test,y_pred_test,labels=[1,0])

In [None]:
#Confusion matrix on the full X / y.
#NOTE(review): X_train / X_test were passed through a StandardScaler in an earlier cell while X
#was not - confirm that predicting on X here is intended.
y_pred = model_grid[0].predict(X)
confusion_matrix(y,y_pred,labels=[1,0])

In [None]:
#Run the logistic-regression grid search once more; best_model is (fitted GridSearchCV, results dict)
grids=[grid2]

best_model = model_compare(grids)

In [None]:
#Predictions on X_test from the fitted search
y_pred_test = best_model[0].predict(X_test)

In [None]:
#Confusion matrix with class 1 listed first, then class 0 (labels=[1,0])
confusion_matrix(y_test,y_pred_test,labels=[1,0])

In [None]:
#Per-class precision / recall / F1 report on the test predictions
print(classification_report(y_test,y_pred_test,labels=[1,0]))

In [None]:
#Fits the GridSearchCV object again, i.e. repeats the whole parameter search
best_model[0].fit(X_train,y_train)

In [None]:
#Best regularisation strength and penalty found by the search
best_C = best_model[0].best_params_['clf__C']
best_penalty = best_model[0].best_params_['clf__penalty']

In [None]:
best_model[0].estimator.steps[0][1]

In [None]:
best_model[0].estimator.steps[0][1].set_params(C=best_C,penalty=best_penalty,random_state=0,solver='liblinear').fit(X_train,y_train)

In [None]:
X_train.columns

In [None]:
coefficients = pd.DataFrame(np.hstack(best_model[0].estimator.steps[0][1].coef_),index=X_train.columns,columns=['coef'])

coefficients['coef_abs'] = coefficients.coef.apply(abs)

coefficients.sort_values(by='coef_abs',ascending=False)

In [None]:
#Best logistic-regression parameters from the results table, shown one per row
pd.Series(grid_df.loc['LogisticRegression','best_params'])

In [None]:
#Hard-coded classifier settings for pipeline2.
#NOTE(review): the C value looks copied from an earlier search result - confirm it still
#matches the current best_params_.
pipeline2.set_params(clf__C =0.0774263682681127,
                     clf__penalty = 'l1', 
                     clf__solver = 'liblinear', clf__verbose=1,
                    clf__max_iter=10000)

In [None]:
#Fit pipeline2 with the settings above
pipeline2.fit(X_train,y_train)

In [None]:
#5-fold cross-validation of pipeline2, run once; the cell shows (mean, std) of the fold scores.
#Previously the search ran twice and the mean was discarded, because a cell only displays
#its last expression.
pipeline2_cv_scores = cross_val_score(pipeline2,X_train,y_train,cv=5)
pipeline2_cv_scores.mean(), pipeline2_cv_scores.std()

In [None]:
#Nested cross-validation of the grid search itself, run once; shows (mean, std) of the fold scores.
#`grid_s` only exists inside model_compare; the fitted search it returns is best_model[0].
#NOTE(review): confirm best_model[0] is the search that was meant here.
nested_cv_scores = cross_val_score(best_model[0],X_train,y_train,cv=5)
nested_cv_scores.mean(), nested_cv_scores.std()

In [None]:
#Search over two estimator families in a single GridSearchCV.
#GridSearchCV takes one estimator, not a list, and each family only accepts its own
#parameters. So the estimator is a one-step pipeline whose 'clf' step is swapped by the
#grid, and param_grid is a list with one sub-grid per family.
logr = LogisticRegression(n_jobs=-1,verbose=1,random_state=0,max_iter=10000)

param_grid = [
    #k-nearest neighbours
    {'clf': [KNeighborsClassifier(n_jobs=-1)],
     'clf__metric': ['minkowski','euclidean'],
     'clf__n_neighbors': range(1,50),
     'clf__weights': ['uniform','distance']},
    #logistic regression (verbose, random_state and max_iter are already set on logr)
    {'clf': [logr],
     'clf__C': np.logspace(-1,0,10),
     'clf__fit_intercept': [True,False],
     'clf__penalty': ['l2','l1'],
     'clf__solver': ['liblinear']},
]

grid_estimator = GridSearchCV(Pipeline([('clf', logr)]), param_grid=param_grid,n_jobs=-1,cv=5)

In [None]:
#Inspect the search configuration
grid_estimator.get_params()

In [None]:
#Run the search on the training data
grid_estimator.fit(X_train,y_train)

In [None]:
#Only the last expression of a cell is displayed, so the score on the first line is not shown
grid_estimator.score(X_train,y_train)
grid_estimator.best_params_

In [None]:
#Nested cross-validation: the whole search is repeated inside each of the 5 outer folds
cross_val_score(grid_estimator,X_train,y_train,cv=5,n_jobs=-1).mean()

In [None]:
#Default LogisticRegression parameters, for reference when writing grids
LogisticRegression().get_params()

In [None]:
#Mean 5-fold CV accuracy of k-nearest neighbours for k = 1..49.
#cross_val_score clones and fits the estimator for every fold itself, so no separate
#knn.fit call is needed before it.
scores = []
neighbors = range(1,50)
for neigh in neighbors:
    knn = KNeighborsClassifier(n_neighbors=neigh,n_jobs=-1)
    scores.append(cross_val_score(knn,X_train,y_train,cv=5,n_jobs=-1).mean())

In [None]:
import matplotlib.pyplot as plt

#Plot mean CV accuracy against k. The x-axis uses `neighbors`, the range the scores were
#computed over (the name `x` is not defined anywhere in this notebook).
scores = np.array(scores)
fig, ax = plt.subplots()
ax.plot(neighbors, scores)
ax.set(xlabel='n_neighbors', ylabel='Mean 5-fold CV accuracy',
       title='KNN cross-validated accuracy by number of neighbors');


#Best k: neighbors starts at 1, so the position of the highest score plus one is the
#neighbor count (the original run reported 24)
scores.argmax()+1

In [None]:
#Final KNN model using the k with the highest mean CV score (position of the best score + 1,
#because the sweep started at k = 1)
knn = KNeighborsClassifier(n_neighbors=scores.argmax()+1,n_jobs=-1)

In [None]:
#Fit the final model on the training data
knn.fit(X_train,y_train)

In [None]:
#Cross-validate once and reuse the fold scores for both summary statistics.
#The second figure is a standard deviation (.std()), so it is labelled as such.
knn_cv_scores = cross_val_score(knn,X_train,y_train,cv=5,n_jobs=-1)

print(f"Training score:{knn.score(X_train,y_train):.3f}")
print(f"Testing score:{knn.score(X_test,y_test):.3f}")
print(f"Cross Validation score:{knn_cv_scores.mean():.3f}")
print(f"Cross Validation std:{knn_cv_scores.std():.3f}")

In [None]:
#Predictions of the final KNN model on X_test
y_pred_test = knn.predict(X_test)

In [None]:
#Confusion matrix with class 1 listed first, then class 0 (labels=[1,0])
confusion_matrix(y_test,y_pred_test,labels=[1,0])

In [None]:
#Per-class precision / recall / F1 report on the test predictions
print(classification_report(y_test,y_pred_test,labels=[1,0]))