## Solution for HW - 3: Classification

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer

In [2]:
df = pd.read_csv('data/course_lead_scoring.csv')
df.head(3)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1


In [3]:
df.head(3).T

Unnamed: 0,0,1,2
lead_source,paid_ads,social_media,events
industry,,retail,healthcare
number_of_courses_viewed,1,1,5
annual_income,79450.0,46992.0,78796.0
employment_status,unemployed,employed,unemployed
location,south_america,south_america,australia
interaction_count,4,1,3
lead_score,0.94,0.8,0.69
converted,1,0,1


### 🧹 Data Preparation

Check if the missing values are presented in the features.  

If there are missing values:  
- For **categorical features**, replace them with `'NA'`  
- For **numerical features**, replace them with `0.0`

In [4]:
df.isnull().sum().sort_values(ascending=False)

annual_income               181
industry                    134
lead_source                 128
employment_status           100
location                     63
number_of_courses_viewed      0
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [5]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [6]:
categorical_col = list(df.dtypes[df.dtypes == 'object'].index)
display(categorical_col)

numeric_col = list(df.dtypes[(df.dtypes == 'int64') | (df.dtypes == 'float64')].index)
display(numeric_col)

['lead_source', 'industry', 'employment_status', 'location']

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

In [7]:
df[categorical_col] = df[categorical_col].fillna('NA')
df[numeric_col] = df[numeric_col].fillna(0)

In [8]:
# Check if we have any missing number - No
df.isnull().sum().sort_values(ascending=False)

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

 - 1.  What is the most frequent observation (mode) for the column industry?

In [9]:
df.industry.mode()

0    retail
Name: industry, dtype: object

- 2. Create the **correlation matrix** for the numerical features of your dataset.  
In a correlation matrix, you compute the **correlation coefficient** between every pair of numerical features.

Now, identify **which two features** among the following have the **highest correlation**:

- `interaction_count` and `lead_score`  
- `number_of_courses_viewed` and `lead_score`  
- `number_of_courses_viewed` and `interaction_count`  
- `annual_income` and `interaction_count`  

In [10]:
# Correlation matrix
corr = df.corr(numeric_only=True).round(3)

# Define the pairs to compare
pairs = {
    ('interaction_count', 'lead_score'): corr.loc['interaction_count', 'lead_score'],
    ('number_of_courses_viewed', 'lead_score'): corr.loc['number_of_courses_viewed', 'lead_score'],
    ('number_of_courses_viewed', 'interaction_count'): corr.loc['number_of_courses_viewed', 'interaction_count'],
    ('annual_income', 'interaction_count'): corr.loc['annual_income', 'interaction_count']
}

# Find the pair with the highest correlation - pairs.get is a function that retrieves the value for a given key
max_pair = max(pairs, key=pairs.get) # Find the key in the dictionary whose value (correlation coefficient) is the largest
max_value = pairs[max_pair]

print(f"The pair with the highest correlation is {max_pair} with correlation = {max_value:.3f}")

The pair with the highest correlation is ('annual_income', 'interaction_count') with correlation = 0.027


Split your dataset into **train**, **validation**, and **test** sets with the following proportions:

- **Training set:** 60%  
- **Validation set:** 20%  
- **Test set:** 20%  

Use **Scikit-Learn’s** `train_test_split` function and set the random seed to `42` for reproducibility.  

Make sure that the **target variable** (the one you’re trying to predict) is **not included** in your feature dataframe.

In [11]:
df_full_train, df_test = train_test_split(df,
                        test_size=0.2,
                        random_state=1
                        )
df_train, df_val = train_test_split(df_full_train,
                        test_size=0.25,
                        random_state=1)

In [12]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [13]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [14]:
# .values ensures that we take the values of the churn variable. Otherwise, we will take an index and the values.
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

# Delete the outcome variable so that we don't use it as a control variable
del df_train['converted']
del df_val['converted']
del df_test['converted']

- 3. Calculate the **mutual information score** between the target variable `converted` and the other **categorical variables** in the dataset.  
Use **only the training set** for this calculation.

Round each score to **2 decimal places** using `round(score, 2)`.

Identify which of the following variables has the **highest mutual information score**:

- `industry`  
- `location`  
- `lead_source`  
- `employment_status`  


In [15]:
def mutual_info_score_func(series):
    return mutual_info_score(series, df_full_train.converted).round(4)

In [16]:
mi = df_full_train[categorical_col].apply(mutual_info_score_func)
mi.sort_values(ascending=False)

lead_source          0.0246
employment_status    0.0127
industry             0.0082
location             0.0012
dtype: float64

- 4. Now let's train a **logistic regression**.    

Remember that we have several **categorical variables** in the dataset. Include them using **one-hot encoding**.  
Fit the model on the **training dataset**.  
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:  

    ```python
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    ```

Calculate the **accuracy** on the **validation dataset** and round it to **2 decimal digits**.

What accuracy did you get?

- `0.64`  
- `0.74`  
- `0.84`  
- `0.94`


- Remove our dependent variable `converted` from the dataset

In [17]:
numeric_col.remove('converted')

In [18]:
# Let's create an instance of DictVectorizer. sparse=False) - return a usual numpy array, if you don't do it like this u will get 
dv = DictVectorizer(sparse=False)

# Try this - df_full_train[['gender', 'contract']].iloc[:10].to_dict(orient='records')
train_dict = df_train[categorical_col + numeric_col].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_col + numeric_col].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [19]:
model = linear_model.LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
# solver='lbfgs' is the default solver in newer version of sklearn
# for older versions, you need to specify it explicitly
model.fit(X_train, y_train)

In [20]:
y_proba_pred = model.predict_proba(X_val)[:, 1]
y_pred = (y_proba_pred >= 0.5).astype(int)
y_pred

array([1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1])

In [24]:
print(f'Accurace score is {round(metrics.accuracy_score(y_val, y_pred), 4)}')

Accurace score is 0.6997
