In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tabulate import tabulate



In [30]:
df=pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv")
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [31]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [33]:
cat_col=['lead_source', 'industry', 'employment_status', 'location']
for cols in cat_col:
    fill_NA=df.fillna('NA')
    
print(fill_NA)
    


         lead_source       industry  number_of_courses_viewed annual_income  \
0           paid_ads             NA                         1       79450.0   
1       social_media         retail                         1       46992.0   
2             events     healthcare                         5       78796.0   
3           paid_ads         retail                         2       83843.0   
4           referral      education                         3       85012.0   
...              ...            ...                       ...           ...   
1457        referral  manufacturing                         1            NA   
1458        referral     technology                         3       65259.0   
1459        paid_ads     technology                         1       45688.0   
1460        referral             NA                         5       71016.0   
1461  organic_search        finance                         3       92855.0   

     employment_status       location  interaction_

In [34]:
df["annual_income"].fillna(0.0)

0       79450.0
1       46992.0
2       78796.0
3       83843.0
4       85012.0
         ...   
1457        0.0
1458    65259.0
1459    45688.0
1460    71016.0
1461    92855.0
Name: annual_income, Length: 1462, dtype: float64

In [35]:
#all_columns = df.columns
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna('NA')  
    else:
        df[col] = df[col].fillna(0.0)   
df
        

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


# Question 1

In [36]:
df['industry'].mode()

0    retail
Name: industry, dtype: object

# Question 2

In [37]:
correlation_list = [
    ['interaction_count', 'lead_score'],
    ['number_of_courses_viewed', 'lead_score'],
    ['number_of_courses_viewed', 'interaction_count'],
    ['annual_income', 'lead_score']
]

max_corr = -1
max_pair = None

for pair in correlation_list:
    corr_value = df[pair].corr().iloc[0, 1]
    if abs(corr_value) > max_corr:
        max_corr = abs(corr_value)
        max_pair = pair

print(f"Highest correlation: {max_pair} = {max_corr}")


Highest correlation: ['number_of_courses_viewed', 'interaction_count'] = 0.023565222882888037


# Question 3

In [38]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

print(f"Train size: {len(df_train)}")
print(f"Validation size: {len(df_val)}")
print(f"Test size: {len(df_test)}")
df_train

Train size: 876
Validation size: 293
Test size: 293


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
1077,paid_ads,retail,0,58472.0,student,middle_east,5,0.03,0
463,organic_search,manufacturing,3,71738.0,student,middle_east,6,0.77,1
842,paid_ads,technology,3,81973.0,employed,north_america,2,0.59,1
835,,technology,1,74956.0,employed,europe,3,0.34,1
837,organic_search,retail,3,59335.0,student,australia,1,0.98,1
...,...,...,...,...,...,...,...,...,...
725,organic_search,other,1,43907.0,employed,australia,4,0.33,1
401,social_media,retail,3,64969.0,employed,north_america,1,0.18,0
957,,education,3,89042.0,employed,asia,4,0.75,1
992,social_media,manufacturing,1,0.0,self_employed,europe,1,0.65,0


In [39]:

labels = df['converted'].loc[df_train.index]

selected_features = ['industry', 'location', 'lead_source', 'employment_status']

categorical_features = df[selected_features].select_dtypes(exclude="number").columns.tolist()
mi_scores = df_train[categorical_features].apply(lambda series: mutual_info_score(series, labels))
mi_scores_sorted = mi_scores.sort_values(ascending=False)

print(mi_scores_sorted.round(2))

highest_score_variable = mi_scores_sorted.idxmax()
highest_score_value = mi_scores_sorted.max()

print(f"The variable with the biggest mutual information score is '{highest_score_variable}' with a score of {highest_score_value:.2f}.")

lead_source          0.04
employment_status    0.01
industry             0.01
location             0.00
dtype: float64
The variable with the biggest mutual information score is 'lead_source' with a score of 0.04.


# Question 4

In [40]:
numerical_features = df_train.select_dtypes(include="number").columns.tolist()
categorical_features = df_train.select_dtypes(exclude="number").columns.tolist()
feature_columns = categorical_features + numerical_features

train_records = df_train[feature_columns].to_dict(orient='records')
validation_records = df_val[feature_columns].to_dict(orient='records')

vectorizer = DictVectorizer(sparse=False)
X_train = vectorizer.fit_transform(train_records)
X_val = vectorizer.transform(validation_records)

y_train = df.converted.loc[df_train.index].values
y_val = df.converted.loc[df_val.index].values


classifier = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
classifier.fit(X_train, y_train)

validation_predictions = classifier.predict(X_val)
validation_accuracy = accuracy_score(y_val, validation_predictions)

print(f'Validation Accuracy: {round(validation_accuracy, 2)}')

Validation Accuracy: 0.76


# Question 5

In [42]:
features_of_interest = ['industry', 'employment_status', 'lead_score']

results = []

accuracy_differences = {}

for feature in features_of_interest:
    features_excluded = [f for f in features_of_interest if f != feature]
    
    train_dict_excluded = df_train[features_excluded].to_dict(orient="records")
    val_dict_excluded = df_val[features_excluded].to_dict(orient="records")

    X_train_excluded = vectorizer.fit_transform(X=train_dict_excluded)
    X_val_excluded = vectorizer.transform(X=val_dict_excluded)
    
    classifier.fit(X_train_excluded, y_train)
    y_val_pred_excluded = classifier.predict(X_val_excluded)
    
    accuracy_excluded = accuracy_score(y_val, y_val_pred_excluded)
    
    accuracy_difference = validation_accuracy - accuracy_excluded
    accuracy_differences[feature] = accuracy_difference
    
    results.append([feature, round(validation_accuracy, 4), round(accuracy_excluded, 4), round(accuracy_difference, 4)])

results_df = pd.DataFrame(results, columns=["Feature", "Accuracy With Feature", "Accuracy Without Feature", "Accuracy Difference"])

print("\nAccuracy Results:")
print(tabulate(results_df, headers='keys', tablefmt='pretty', floatfmt=".4f"))

least_useful_feature = min(accuracy_differences, key=accuracy_differences.get)
smallest_difference_value = accuracy_differences[least_useful_feature]

print(f'\nThe least useful feature is "{least_useful_feature}" with a difference of {smallest_difference_value:.4f}.')


Accuracy Results:
+---+-------------------+-----------------------+--------------------------+---------------------+
|   |      Feature      | Accuracy With Feature | Accuracy Without Feature | Accuracy Difference |
+---+-------------------+-----------------------+--------------------------+---------------------+
| 0 |     industry      |        0.7577         |          0.6143          |       0.1433        |
| 1 | employment_status |        0.7577         |          0.6519          |       0.1058        |
| 2 |    lead_score     |        0.7577         |          0.6246          |       0.1331        |
+---+-------------------+-----------------------+--------------------------+---------------------+

The least useful feature is "employment_status" with a difference of 0.1058.


# Question 6

In [43]:
regularization_strengths = [0.01, 0.1, 1, 10, 100]
best_accuracy_score = 0
optimal_C = None

for strength in regularization_strengths:
    
    classifier = LogisticRegression(solver='liblinear', C=strength, max_iter=1000, random_state=42)
    classifier.fit(X_train, y_train)
    
  
    validation_predictions = classifier.predict(X_val)
    

    current_accuracy = accuracy_score(y_val, validation_predictions)
    current_accuracy_rounded = round(current_accuracy, 3)
    
    print(f'C={strength}: Validation Accuracy = {current_accuracy_rounded}')
    
    
   
    if (current_accuracy_rounded > best_accuracy_score or 
        (current_accuracy_rounded == best_accuracy_score and strength < optimal_C)):
        best_accuracy_score = current_accuracy_rounded
        optimal_C = strength

print(f'\nOptimal regularization parameter: C = {optimal_C}')
print(f'Best validation accuracy: {best_accuracy_score:.3f}')

C=0.01: Validation Accuracy = 0.768
C=0.1: Validation Accuracy = 0.758
C=1: Validation Accuracy = 0.758
C=10: Validation Accuracy = 0.758
C=100: Validation Accuracy = 0.758

Optimal regularization parameter: C = 0.01
Best validation accuracy: 0.768
