## Homework - 4 (Evaluation)

# 🧩 DATA PREPARATION
# -------------------------------------------------------------
1️⃣ Check if there are missing values in the features.

2️⃣ If missing values exist:
     • For categorical features → replace with 'NA'
     • For numerical features   → replace with 0.0

3️⃣ Split the dataset into three parts:
     • Train → 60%
     • Validation → 20%
     • Test → 20%

 Use the train_test_split() function with random_state=1

In this dataset our desired target for classification task will be `converted` variable - has the client signed up to the platform or not.
# -------------------------------------------------------------


In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer 
from sklearn import linear_model
from sklearn import metrics

In [3]:
# url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
# data = pd.read_csv(url)
data = pd.read_csv('data/course_lead_scoring.csv')
display(data.head(3))
print('\n')
display(data.tail(3))


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1






Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1
1461,organic_search,finance,3,92855.0,student,north_america,3,0.41,1


In [4]:
display(data.head().T)

Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


- Number of missing values

In [6]:
data.isnull().sum().sort_values(ascending=False)

annual_income               181
industry                    134
lead_source                 128
employment_status           100
location                     63
number_of_courses_viewed      0
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

- Descriptive stats

In [8]:
data.describe(include='object')

Unnamed: 0,lead_source,industry,employment_status,location
count,1334,1328,1362,1399
unique,5,7,4,7
top,organic_search,retail,self_employed,north_america
freq,282,203,352,225


In [10]:
data.describe(include='number').round(3)

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
count,1462.0,1281.0,1462.0,1462.0,1462.0
mean,2.031,59886.273,2.977,0.506,0.619
std,1.45,15070.14,1.682,0.288,0.486
min,0.0,13929.0,0.0,0.0,0.0
25%,1.0,49698.0,2.0,0.262,0.0
50%,2.0,60148.0,3.0,0.51,1.0
75%,3.0,69639.0,4.0,0.75,1.0
max,9.0,109899.0,11.0,1.0,1.0


- Check data types

In [7]:
data.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [47]:
categor_feature = list(data.dtypes[data.dtypes=='object'].index)
numer_feature = list(data.dtypes[data.dtypes!='object'].index)
display(categor_feature, numer_feature)

['lead_source', 'industry', 'employment_status', 'location']

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

- Fill missing observations

In [48]:
data[categor_feature] = data[categor_feature].fillna('NA')
data[numer_feature] = data[numer_feature].fillna(0)

data.isnull().sum().sort_values(ascending=False)

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [43]:
# Split the data into 3 parts: train/validation/test with 60%/20%/20% distribution. Use train_test_split function for that with random_state=1
# Train, Test, Validation split

df_full_train, df_test = train_test_split(data,
                                           test_size=0.2,
                                           random_state=1
                                           )
df_train, df_val = train_test_split(df_full_train,
                                    test_size=0.25,
                                    random_state=1
                                    )

len(df_train), len(df_test), len(df_val)

(876, 293, 293)

In [49]:
# Prepare final control variables
features = numer_feature + categor_feature
features.remove('converted')
target = 'converted'

In [45]:
X_train = df_train[features].reset_index(drop=True)
y_train = df_train.converted.values

X_val = df_val[features].reset_index(drop=True)
y_val = df_val.converted.values

X_test = df_test[features].reset_index(drop=True)
y_test = df_test.converted.values

### 🧮 Question 1: ROC AUC feature importance
ROC AUC could also be used to evaluate feature importance of numerical variables.

For each numerical variable, use it as score (aka prediction) and compute the AUC with the y variable as ground truth.
Use the training dataset for that
If your AUC is < 0.5, invert this variable by putting "-" in front

(e.g. -df_train['balance'])

AUC can go below 0.5 if the variable is negatively correlated with the target variable. 
You can change the direction of the correlation by negating this variable - then negative correlation becomes positive.

Which numerical variable (among the following 4) has the highest AUC?

    - lead_score
    - number_of_courses_viewed
    - interaction_count
    - annual_income

In [50]:
numer_feature

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

In [None]:
# Exlude target variable
numer_control = numer_feature.remove(target)
for var in numer_control:
    auc = metrics.roc_auc_score(y_train, X_train[var])
    if auc < 0.5:
        auc = metrics.roc_auc_score(y_train, -X_train[var])
    print(f'Feature name {var} has AUC of {auc:.3f}')    

TypeError: 'NoneType' object is not iterable