In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline

In [None]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

In [None]:
!wget $data

--2025-10-28 09:48:59--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-28 09:48:59 (5.95 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [None]:
df = pd.read_csv(data)

df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [None]:
df.dtypes

Unnamed: 0,0
lead_source,object
industry,object
number_of_courses_viewed,int64
annual_income,float64
employment_status,object
location,object
interaction_count,int64
lead_score,float64
converted,int64


In [None]:
df.isna().sum()

Unnamed: 0,0
lead_source,128
industry,134
number_of_courses_viewed,0
annual_income,181
employment_status,100
location,63
interaction_count,0
lead_score,0
converted,0


In [None]:
strings = list(df.dtypes[df.dtypes =='object'].index)

strings

['lead_source', 'industry', 'employment_status', 'location']

In [None]:
for cols in strings:
  df[cols] = df[cols].fillna('NA')

In [None]:
df['annual_income'] = df['annual_income'].fillna(0)

In [None]:
df.isna().sum()

Unnamed: 0,0
lead_source,0
industry,0
number_of_courses_viewed,0
annual_income,0
employment_status,0
location,0
interaction_count,0
lead_score,0
converted,0


*Question 1*
- What is the most frequent observation (mode) for the column industry?

In [None]:
df.industry.value_counts()

Unnamed: 0_level_0,count
industry,Unnamed: 1_level_1
retail,203
finance,200
other,198
healthcare,187
education,187
technology,179
manufacturing,174
,134


The most frequent observation (mode) for the column industry is retail.

In [None]:
numbers = list(df.select_dtypes(include = ['int64','float64']).columns)

numbers

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

**Question 2**
- Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

- What are the two features that have the biggest correlation?

- interaction_count and lead_score
- number_of_courses_viewed and lead_score
- number_of_courses_viewed and interaction_count
- annual_income and interaction_count

- Only consider the pairs above when answering this question.

In [None]:
corr_matrix = df[numbers].corr()

corr_matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

### 🧮 Question 3

Calculate the **mutual information score** between `converted` and other categorical variables in the dataset.  
Use the **training set only**.

Round the scores to **2 decimal places** using `round(score, 2)`.

**Which of these variables has the biggest mutual information score?**

- `industry`  
- `location`  
- `lead_source`  
- `employment_status`


In [None]:
from sklearn.metrics import mutual_info_score

cols = df.columns.drop('converted')
n = len(cols)
mi_matrix = pd.DataFrame(index=cols, columns=['converted'])

for i in range(n):
  mi = mutual_info_score(df[cols[i]], df['converted'])
  mi_matrix.loc[cols[i], 'converted'] = mi

mi_matrix = mi_matrix.astype(float)
mi_matrix_rounded = mi_matrix.round(2)

print(mi_matrix_rounded)

                          converted
lead_source                    0.03
industry                       0.01
number_of_courses_viewed       0.12
annual_income                  0.58
employment_status              0.01
location                       0.00
interaction_count              0.08
lead_score                     0.05




In [None]:
numerical = [col for col in numbers if col != 'converted']


In [None]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

train_dict = df_train[strings + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)


### 🤖 Question 4

Now let's **train a Logistic Regression model**.

Remember that we have several **categorical variables** in the dataset — include them using **one-hot encoding**.

Fit the model on the **training dataset**.

To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:

```python
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
```
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
- What accuracy did you get?

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [None]:
model.fit(X_train, y_train)

In [None]:
df_val.isna().sum()

Unnamed: 0,0
lead_source,0
industry,0
number_of_courses_viewed,0
annual_income,0
employment_status,0
location,0
interaction_count,0
lead_score,0


In [None]:
val_dict = df_val[strings + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [None]:
y_pred = model.predict(X_val)
org_accuracy = accuracy_score(y_val, y_pred)

org_accuracy


0.6996587030716723

In [None]:
# Store the rounded accuracy in a new variable
rounded_org_accuracy = round(org_accuracy, 2)
display(rounded_org_accuracy)

0.7

**Question 5**
* Let's find the least useful feature using the feature elimination technique.
* Train a model using the same features and parameters as in Q4 (without rounding).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
* Which of following feature has the smallest difference?

* 'industry'
* 'employment_status'
* 'lead_score'
* Note: The difference doesn't have to be positive.

In [None]:
features = df_train.columns.to_list()

features

['lead_source',
 'industry',
 'number_of_courses_viewed',
 'annual_income',
 'employment_status',
 'location',
 'interaction_count',
 'lead_score']

In [None]:
remove_features = ['industry', 'employment_status', 'lead_score']

remove_features

['industry', 'employment_status', 'lead_score']

In [None]:
scores = {}

for x in remove_features:
  cols = features.copy()
  cols.remove(x)

  dv = DictVectorizer(sparse=False)
  train_dict = df_train[cols].to_dict(orient='records')
  X_train = dv.fit_transform(train_dict)
  model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
  model.fit(X_train, y_train)

  val_dict = df_val[cols].to_dict(orient='records')
  X_val = dv.transform(val_dict)
  yval_preds = model.predict(X_val)

  scores[x] = accuracy_score(y_val, yval_preds)




In [None]:

scores

{'industry': 0.6996587030716723,
 'employment_status': 0.6962457337883959,
 'lead_score': 0.7064846416382252}

In [None]:
df_scores = pd.DataFrame({
    'accuracy': list(scores.values()),
    'accuracy_diff': [org_accuracy - v for v in scores.values()]
}, index=scores.keys()).sort_values(by='accuracy_diff')

df_scores

Unnamed: 0,accuracy,accuracy_diff
lead_score,0.706485,-0.006826
industry,0.699659,0.0
employment_status,0.696246,0.003413


industry has the smallest difference

**Question 6**
* Now let's train a regularized logistic regression
* Let's try the following values of the parameter C: [0, 0.01, 0.1, 1, 10]
* Train models using all the features as in Q4
* Calculate the accuracy on the validation dataset and round it to 3 decimal digits
* Which of these C leads to the best accuracy on the validation set?
* note: If there are multiple options, select the smallest C.

In [None]:
dv = DictVectorizer(sparse=False)
train_dict = df_train[strings + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

In [None]:
scores = []

for C in [0.01, 0.1, 1, 10]:
  model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
  model.fit(X_train, y_train)

  val_dict = df_val[strings + numerical].to_dict(orient='records')
  X_val = dv.transform(val_dict)
  yval_preds = model.predict(X_val)
  score = accuracy_score(y_val, yval_preds)
  scores.append(score)
  print(f'C: {C}, Accuracy: {score}')



C: 0.01, Accuracy: 0.6996587030716723
C: 0.1, Accuracy: 0.6996587030716723
C: 1, Accuracy: 0.6996587030716723
C: 10, Accuracy: 0.6996587030716723


C=0.01 leads to the best accuracy on the validation set


