In [None]:
import numpy as np                  # Mathetimatical Operations
import pandas as pd                 # Data manipulation

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt     # Used for plotting graphs.
%matplotlib inline

In [None]:
# DATA COLLECTION

# Keep wide frames on a single row block instead of wrapping them. This is set
# before anything is rendered so that every summary below benefits from it.
pd.set_option('display.expand_frame_repr', False)

# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
df = pd.read_csv('C:/Users/Amit Gupta/Desktop/TP/Code/Apply_Rate_2019.csv')

# Jupyter renders only the LAST bare expression of a cell, so intermediate
# results are passed to display() (an IPython built-in) to make them visible.
display(df.head())  # first 5 rows
print('Total number of observations in the dataset are:', df.shape[0])  # .shape is (rows, columns)
df.info()  # column dtypes and non-null counts (prints directly)
display(df.drop(['apply'], axis=1).describe())  # axis=1 drops the column 'apply' before summarising
display(df['job_age_days'].describe())
df.describe()

In [None]:
# Checking the distribution for classes who applied and did not apply

# Number of rows for each value of 'apply', most frequent first.
# (Series.value_counts replaces the deprecated top-level pd.value_counts.)
count_classes = df['apply'].value_counts(sort=True)
count_classes.plot(kind='bar')
plt.title("Apply Rate")
plt.xticks(range(2))
plt.xlabel("Class")
plt.ylabel("Frequency");

# .get(label, 0) avoids a KeyError when one of the two classes is absent.
n_not_applied = count_classes.get(0, 0)
n_applied = count_classes.get(1, 0)
print('Count of number of customers who didnt apply:', n_not_applied)
print('Count of number of customers who applied:', n_applied)

# Applies per 100 non-applies. The earlier version divided non-apply by apply
# and never scaled by 100, so the printed number was neither the stated ratio
# nor a percentage.
if n_not_applied:
    apply_to_non_apply_pct = 100.0 * n_applied / n_not_applied
else:
    apply_to_non_apply_pct = float('nan')
print('Percentage of apply to non apply as per the data', apply_to_non_apply_pct, '%')


In [None]:
# Checking the correlation between the features

# A column is dropped when its correlation with an earlier column exceeds this value.
CORR_DROP_THRESHOLD = 0.50

f, ax = plt.subplots(figsize=(12, 10))
# Correlate numeric/bool columns only: recent pandas raises on string columns
# instead of silently skipping them.
corr = df.select_dtypes(include=['number', 'bool']).corr()
hm = sns.heatmap(corr.round(2), annot=True, ax=ax, cmap="Reds", fmt='.2f',
                 linewidths=.05)
f.subplots_adjust(top=0.93)
t = f.suptitle('Click Through Rate Variable Correlation Heatmap', fontsize=14)

# Keep only the strict upper triangle so every column pair is inspected once.
# The builtin bool is used because the np.bool alias was removed from NumPy.
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

# Columns whose correlation with an earlier column exceeds the threshold.
# The target 'apply' is never a candidate: later cells read df['apply'].
# NOTE(review): only positive correlations are tested; use corr.abs() if
# strongly negative pairs should be pruned as well.
to_drop = [
    column for column in upper.columns
    if column != 'apply' and (upper[column] > CORR_DROP_THRESHOLD).any()
]
df = df.drop(columns=to_drop)
to_drop

In [None]:
# Box Plot for the variables

l = ['title_proximity_tfidf', 'description_proximity_tfidf',
       'query_jl_score', 'query_title_score',
       'city_match', 'job_age_days']
for column_name in l:
    # Pass the series by keyword: a positional first argument combined with
    # data= raises "multiple values for argument 'data'" on seaborn >= 0.12.
    ax = sns.boxplot(x=df[column_name])
    plt.show()

# Checking the distribution

# One overlaid histogram per column, split by the value of 'apply'.
# The last three columns are skipped.
# NOTE(review): presumably those are 'apply', 'search_date_pacific' and 'class_id' -- confirm column order.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept here
# so the cell still runs on the older library versions this notebook targets.
for feature in df.columns[:-3]:
    ax = plt.subplot()
    sns.distplot(df[df['apply'] == 1][feature], bins=50, label='Anormal(Apply=1)')
    sns.distplot(df[df['apply'] == 0][feature], bins=50, label='Normal(Apply=0)')
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + str(feature))
    plt.legend(loc='best')
    plt.show()

import numpy as np
from scipy.stats import kurtosis
from scipy.stats import skew

# Skewness / kurtosis are only defined for numeric data; recent pandas raises
# on string columns instead of skipping them, so restrict the frame first.
numeric_df = df.select_dtypes(include=['number', 'bool'])
display(numeric_df.skew(axis=0, skipna=True))  # display(): not the cell's last expression
numeric_df.kurtosis()

In [None]:
# DATA CLEANING

print(df.shape)  # dimensions before de-duplication
df = df.drop_duplicates(keep='first')  # keep the first occurrence of each duplicated row
print(df.shape)  # dimensions after de-duplication

print("The number of null values for the all the columns in df are:")
display(df.isnull().sum())  # display(): not the cell's last expression

# Columns with null values are title_proximity_tfidf, description_proximity_tfidf, city_match
# The two tf-idf columns will be filled with their column mean.
mean_title_proxmity_tfidf = df['title_proximity_tfidf'].mean()
mean_description_proxmity_tfidf = df['description_proximity_tfidf'].mean()
print('mean title_proximity_tfidf:', mean_title_proxmity_tfidf)
print('mean description_proximity_tfidf:', mean_description_proxmity_tfidf)

# city_match will be filled with whichever of 0 / 1 occurs more often.
city_match_counts = df['city_match'].value_counts()
display(city_match_counts)
t1 = city_match_counts.get(0, 0)  # rows with city_match == 0
t2 = city_match_counts.get(1, 0)  # rows with city_match == 1
# A tie (or an absent class) falls back to 0; previously the variable was left
# undefined in that case and the next cell failed with a NameError.
city_match_imputed_value = 1 if t2 > t1 else 0
city_match_imputed_value


In [None]:
# Null Value Detection and Treatment

# Way-1
# Replacement value for each column that contains nulls, using the values
# prepared in the data-cleaning step.
fill_values = {
    'title_proximity_tfidf': mean_title_proxmity_tfidf,
    'description_proximity_tfidf': mean_description_proxmity_tfidf,
    'city_match': city_match_imputed_value,
}
for column_name, replacement in fill_values.items():
    df[column_name] = df[column_name].fillna(replacement)
df.isnull().sum()  # remaining null count per column

# Way-2
# Mean-impute with scikit-learn. sklearn.preprocessing.Imputer has been removed
# from scikit-learn; sklearn.impute.SimpleImputer is its replacement.
from sklearn.impute import SimpleImputer

# Only numeric columns that are partly (not entirely) missing are imputed:
#  - string columns cannot be averaged,
#  - columns without nulls are left alone so their dtype is unchanged,
#  - an all-null column has no mean to impute with.
cols_to_impute = [
    column for column in df.select_dtypes(include='number').columns
    if 0 < df[column].isnull().sum() < len(df)
]
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
if cols_to_impute:
    # Write the result back column-wise so df stays a DataFrame; assigning the
    # bare transform() output would replace it with a NumPy array and break
    # every later cell that uses df['...'].
    df[cols_to_impute] = imp.fit_transform(df[cols_to_impute])
# Every NaN in the selected columns is replaced by that column's mean.

#Other Ways to treat null values
# Missing value treatment by removing all the rows having missing values
# dropna() returns a NEW frame, so df keeps all of its rows. The earlier
# `df1 = df` followed by an in-place dropna removed the rows from df itself.
df1 = df.dropna()
# summarize the number of rows and columns in the dataset
print(df1.shape)
display(df1.iloc[0:10, 0])  # display(): not the cell's last expression

# Missing value treatment using Imputation:
# Work on an explicit copy so that df is not modified.
df2 = df.copy()
# count the number of NaN values in each column
print(df2.isnull().sum())
# fill missing values with mean column values (means exist for numeric columns only)
df2 = df2.fillna(df2.mean(numeric_only=True))
# count the number of NaN values in each column after replacing NaN values
print(df2.isnull().sum())

# Using Imputer class to treat missing values

# SimpleImputer replaces sklearn.preprocessing.Imputer, which has been removed
# from scikit-learn.
from sklearn.impute import SimpleImputer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
df3 = df.copy()  # explicit copy: nothing below may modify df
df3.columns
# split dataset into inputs and outputs
values = df3.values
# The first six columns are the inputs and the seventh is the output.
# Cast to float because .values of a mixed-dtype frame is an object array.
X = values[:, 0:6].astype(float)
y = values[:, 6]
# fill missing values with mean column values
imputer = SimpleImputer()
transformed_X = imputer.fit_transform(X)
len(transformed_X)

# Snap the 0/1 columns back to {0, 1} after mean imputation (an imputed mean
# is fractional). Applied to every row of the imputed matrix; the earlier loops
# only touched the first len(X[4]) rows -- that is the column count -- of the
# un-imputed X, and their "< 1" test turned genuine 1s into 0.
# NOTE(review): position 4 is presumably 'city_match'; position 5 looks like
# 'job_age_days', which is probably not binary -- confirm before relying on it.
for binary_col in (4, 5):
    transformed_X[:, binary_col] = np.where(transformed_X[:, binary_col] > 0.5, 1, 0)

In [None]:
# Outlier Treatment

# Getting all col names
col_names = list(df.columns)

# The last three columns are excluded from outlier treatment.
# NOTE(review): presumably 'apply', 'search_date_pacific' and 'class_id' -- confirm column order.
a = len(df.columns) - 3
temp = a
outlier_cols = col_names[:max(a, 0)]

# Finding the 0th, 25th, 75th, 100th percentile for all the columns.
# (max_list previously read the 75th percentile a second time.)
min_list = df[outlier_cols].min().tolist()
q1_list = df[outlier_cols].quantile(0.25).tolist()
q3_list = df[outlier_cols].quantile(0.75).tolist()
max_list = df[outlier_cols].max().tolist()

# Finding the IQR for all columns
iqr = [q3 - q1 for q1, q3 in zip(q1_list, q3_list)]

# Finding the min and max range of values for all columns: Q1 - 1.5*IQR and Q3 + 1.5*IQR
min_value_cols = [q1 - 1.5 * spread for q1, spread in zip(q1_list, iqr)]
max_value_cols = [q3 + 1.5 * spread for q3, spread in zip(q3_list, iqr)]

# One summary table instead of several bare expressions that were never rendered.
display(pd.DataFrame(
    {
        'min': min_list,
        'q1': q1_list,
        'q3': q3_list,
        'max': max_list,
        'iqr': iqr,
        'lower_bound': min_value_cols,
        'upper_bound': max_value_cols,
    },
    index=outlier_cols,
))

# Replacing outlier values with boundary values.
# A single vectorised clip replaces the per-cell iloc loop, which was very slow
# and printed debug lines for every cell. NaN values are left untouched.
# NOTE(review): a column whose IQR is 0 (e.g. a heavily skewed 0/1 flag) is
# collapsed to a single value by this rule -- confirm that is intended.
if outlier_cols:
    lower_bounds = pd.Series(min_value_cols, index=outlier_cols)
    upper_bounds = pd.Series(max_value_cols, index=outlier_cols)
    df[outlier_cols] = df[outlier_cols].clip(lower=lower_bounds, upper=upper_bounds, axis=1)

# Storing the updated data as CSV
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
df.to_csv(r'C:\Users\Amit Gupta\Desktop\TP\Code\Null_Outlier_Treated_data.csv')

In [None]:
# Log Transformation for all the columns which are not normally distributed.

log_cols = ['title_proximity_tfidf', 'description_proximity_tfidf',
            'query_jl_score', 'query_title_score',
            'city_match', 'job_age_days']
# NOTE(review): np.log maps 0 to -inf and negative inputs to NaN. If any of
# these columns can hold such values (a 0/1 flag would), np.log1p or a shifted
# log may be more appropriate -- confirm.
df_temp = np.log(df[log_cols])
display(df_temp.head())  # display(): not the cell's last expression

# Log Transformed Data Stored
# Write the TRANSFORMED values: the earlier version saved the untransformed df
# under this file name. df itself is left unchanged for the modelling cells.
df_log = df.copy()
df_log[log_cols] = df_temp
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
df_log.to_csv(r'C:\Users\Amit Gupta\Desktop\TP\Code\Log_Transformed_data.csv')

In [None]:
# MODELLING

# Splitting the dataset into training and testing

# Splitting the dataset by date: rows before TEST_DATE train the model, rows
# on TEST_DATE evaluate it.
# NOTE(review): this is a string comparison; it orders correctly only if
# 'search_date_pacific' holds ISO 'YYYY-MM-DD' text -- confirm. Rows dated
# after TEST_DATE, if any, end up in neither split.
TEST_DATE = '2018-01-27'
NON_FEATURE_COLS = ['search_date_pacific', 'class_id']

# Drop the unnecessary columns.
# drop() returns new frames; an in-place drop on a slice of df triggers
# SettingWithCopyWarning.
train = df.loc[df['search_date_pacific'] < TEST_DATE].drop(columns=NON_FEATURE_COLS)
test = df.loc[df['search_date_pacific'] == TEST_DATE].drop(columns=NON_FEATURE_COLS)

# Drop irrelevant features and reset the index (drop=True discards the old index)
X = df.drop(NON_FEATURE_COLS + ['apply'], axis=1).reset_index(drop=True)
y = df['apply'].reset_index(drop=True)

X_train = train.drop(['apply'], axis=1)
y_train = train['apply']
X_test = test.drop(['apply'], axis=1)
y_test = test['apply']


In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Define a helper that prints the F1 score and AUC-ROC, plots the confusion matrix and draws the ROC curve

def report(test_set, predictions, labels, title):
    """Print F1 and AUC-ROC, plot the confusion matrix and draw the ROC curve.

    Parameters
    ----------
    test_set : true target values.
    predictions : predicted values for the same samples; used for every metric
        and for the ROC curve.
    labels : forwarded unchanged to ``plot_confusion_matrix``.
    title : forwarded unchanged to ``plot_confusion_matrix``.

    Returns
    -------
    None. Output is printed text and matplotlib figures.
    """
    # Imported locally so the function works regardless of cell order: these
    # names are not imported anywhere before this definition in the notebook.
    from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, roc_curve

    # Computed once and reused for both the printout and the legend.
    auc = roc_auc_score(test_set, predictions)
    print('F1 score is:', f1_score(test_set, predictions))
    print("AUC-ROC is: %3.2f" % auc)
    # NOTE(review): plot_confusion_matrix(matrix, labels, title) is not defined
    # in the visible part of this notebook -- confirm the helper exists.
    plot_confusion_matrix(confusion_matrix(test_set, predictions), labels, title)

    #plot the curve
    fpr, tpr, threshold = roc_curve(test_set, predictions)
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.set_title('Receiver Operating Characteristic')
    ax.plot(fpr, tpr, 'b', label='Model - AUC = %0.3f' % auc)
    ax.plot([0, 1], [0, 1], 'r--', label='Chance')
    # Single legend call after both lines exist; a second bare legend() call
    # used to discard the 'lower right' placement.
    ax.legend(loc='lower right')
    ax.set_xlim([-0.1, 1.0])
    ax.set_ylim([-0.1, 1.01])
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')
    plt.show()

In [None]:
# XGBoost
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, roc_auc_score

# Train a classifier with default settings on the training split.
model = XGBClassifier().fit(X_train, y_train)
# Predict for the test split and round each prediction.
y_pred = model.predict(X_test)
#labels = ['No Apply', 'Apply']
predictions = list(map(round, y_pred))
# Share of test rows predicted correctly, reported as a percentage.
accuracy = accuracy_score(y_test, predictions)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)

In [None]:
# LightGBM
import lightgbm as lgb

# Train a classifier with default settings on the training split.
model = lgb.LGBMClassifier().fit(X_train, y_train)
# Predict for the test split and round each prediction.
y_pred = model.predict(X_test)
#labels = ['No Apply', 'Apply']
predictions = list(map(round, y_pred))
# Share of test rows predicted correctly, reported as a percentage.
accuracy = accuracy_score(y_test, predictions)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

regressor = RandomForestRegressor(n_estimators=20, random_state=20)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# The regressor returns continuous scores, which the classification metrics
# below reject. Round them to 0/1 labels once and use the labels throughout.
# (NumPy rounds halves to the nearest even value, so exactly 0.5 becomes 0.)
# NOTE(review): RandomForestClassifier would give class labels directly --
# consider it if regression is not specifically required here.
y_pred_labels = y_pred.round().astype(int)

# normalize=False returns the COUNT of correct predictions rather than the fraction.
print('Correctly classified samples:', accuracy_score(y_test, y_pred_labels, normalize=False))
print(confusion_matrix(y_test, y_pred_labels))
print(classification_report(y_test, y_pred_labels))
print(accuracy_score(y_test, y_pred_labels))