In [1]:
import pandas as pd

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-15 04:51:34--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-15 04:51:35 (11.8 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [3]:
# Load the downloaded CSV into a DataFrame.
df = pd.read_csv("course_lead_scoring.csv")

In [4]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [5]:
# Count missing values per column.
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [6]:
# Show the dtype of every column.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [8]:
def cat_impute(df):
    """Return a copy of ``df`` with missing values in object columns set to 'NA'.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every object-dtype column has its missing values
        replaced by the string 'NA'; all other columns are left untouched.
    """
    # Work on a copy so re-running the calling cell never alters the raw frame.
    out = df.copy()
    cat_cols = out.select_dtypes(include='object').columns
    # Skip the assignment when there is nothing to fill.
    if len(cat_cols) > 0:
        out[cat_cols] = out[cat_cols].fillna('NA')
    return out

def num_impute(df):
    """Return a copy of ``df`` with missing values in float/int columns set to 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every column selected by
        ``select_dtypes(include=['float', 'int'])`` has its missing values
        replaced by 0.0; all other columns are left untouched.
    """
    # Work on a copy so re-running the calling cell never alters the raw frame.
    out = df.copy()
    num_cols = out.select_dtypes(include=['float', 'int']).columns
    # Skip the assignment when there is nothing to fill.
    if len(num_cols) > 0:
        out[num_cols] = out[num_cols].fillna(0.0)
    return out

In [10]:
# Fill categorical gaps first, then numeric ones.
df = df.pipe(cat_impute).pipe(num_impute)

In [11]:
# Re-count missing values per column after imputation.
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [15]:
df['industry'].value_counts()  # retail is the most frequent industry

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [22]:
# Keep only the int64/float64 columns; this still includes the target `converted`.
num_df = df.select_dtypes(include=['int64', 'float64'])

In [17]:
# Pairwise correlation between the numeric columns (DataFrame.corr defaults to Pearson).
corr_matrix = num_df.corr()

In [20]:
# Display the correlation matrix.
corr_matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [24]:
# List the column names.
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [23]:
from sklearn.model_selection import train_test_split

In [25]:
# Separate the target column from the feature matrix.
y = df['converted']
X = df.drop(columns=['converted'])

In [27]:
# 60/20/20 split: hold out 40% of the rows first, then halve that hold-out
# into validation and test sets. Both splits use the same seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.40, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42
)

In [29]:
from sklearn.feature_selection import mutual_info_classif

In [30]:
# Object-dtype (categorical) columns of the training split.
cat_feat = X_train.select_dtypes(include='object')

In [31]:
# One-hot encode the categorical training columns, dropping the first level of each.
# NOTE(review): X_train_encoded is not referenced by any later cell visible here — confirm before removing.
X_train_encoded = pd.get_dummies(cat_feat,drop_first=True)

In [36]:
from sklearn.metrics import mutual_info_score

# Mutual information between each categorical feature and the target, using the
# training split only. The score is computed on the raw category labels:
# one-hot encoding with drop_first=True and summing the per-dummy scores does
# not give the mutual information of the categorical variable itself.
for col in ['location', 'industry', 'lead_source', 'employment_status']:
    mi_score = mutual_info_score(X_train[col], y_train)

    total_mi = round(mi_score, 2)

    print(f"{col}: {total_mi}")


location: 0.0
industry: 0.02
lead_source: 0.03
employment_status: 0.02


In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score


# Column groups by dtype: object columns are categorical, the rest numeric.
cat_cols = X_train.select_dtypes(include=['object']).columns
num_cols = X_train.select_dtypes(exclude=['object']).columns

# One-hot encode the categorical columns (categories unseen during fit are
# ignored at transform time); numeric columns pass through unchanged.
one_hot_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
preprocessor = ColumnTransformer(
    transformers=[one_hot_step],
    remainder='passthrough',
)


In [38]:


# Logistic regression with the liblinear solver and a fixed seed.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Chain preprocessing and the classifier, fit on the training split,
# then predict labels for the validation split.
pipeline_steps = [('preprocessor', preprocessor), ('model', model)]
clf = Pipeline(steps=pipeline_steps).fit(X_train, y_train)

y_pred = clf.predict(X_val)




In [39]:
# Share of validation rows whose predicted label matches the true label.
acc = accuracy_score(y_val, y_pred)

print("Validation Accuracy:", round(acc, 2))

Validation Accuracy: 0.74


In [44]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Leave-one-feature-out check: refit without each column in turn and compare
# the validation accuracy with the all-features baseline.
cat_cols = X_train.select_dtypes(include='object').columns
num_cols = X_train.select_dtypes(exclude='object').columns

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'
)

model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42
)


# Baseline: all features.
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('model', model)])
pipe.fit(X_train, y_train)
baseline_acc = accuracy_score(y_val, pipe.predict(X_val))
print("Baseline accuracy:", baseline_acc)


feature_diffs = {}

for col in X_train.columns:
    X_train_reduced = X_train.drop(columns=[col])
    X_val_reduced = X_val.drop(columns=[col])

    # Rebuild preprocessing for reduced features
    cat_cols_reduced = X_train_reduced.select_dtypes(include='object').columns

    preprocessor_reduced = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols_reduced)
        ],
        remainder='passthrough'
    )

    # Pipeline.fit fits its final estimator in place, so every reduced pipeline
    # gets its own unfitted copy of the classifier. Reusing `model` directly
    # would overwrite the estimator that the baseline `pipe` relies on.
    pipe_reduced = Pipeline(steps=[('preprocessor', preprocessor_reduced),
                                   ('model', clone(model))])

    pipe_reduced.fit(X_train_reduced, y_train)
    acc_reduced = accuracy_score(y_val, pipe_reduced.predict(X_val_reduced))

    # Sign convention: negative means accuracy fell when the column was removed.
    feature_diffs[col] = acc_reduced - baseline_acc

# One row per removed feature, sorted from largest accuracy loss to largest gain.
diff_df = pd.DataFrame.from_dict(feature_diffs, orient='index', columns=['Accuracy_Diff'])
diff_df = diff_df.sort_values(by='Accuracy_Diff')
print(diff_df)


Baseline accuracy: 0.7431506849315068
                          Accuracy_Diff
interaction_count             -0.068493
number_of_courses_viewed      -0.065068
lead_source                   -0.013699
industry                       0.000000
location                       0.000000
lead_score                     0.000000
employment_status              0.003425
annual_income                  0.113014


In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Inverse regularisation strengths to compare
C_values = [0.01, 0.1, 1, 10, 100]

# Categorical columns to one-hot encode
cat_cols = X_train.select_dtypes(include='object').columns

# Validation accuracy (rounded to 3 decimals) keyed by C
val_accuracies = {}

for C in C_values:
    # Fresh preprocessing step for every fit so no fitted state is shared
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
        ],
        remainder='passthrough'
    )

    # Logistic Regression with current C
    model = LogisticRegression(
        solver='liblinear',
        C=C,
        max_iter=1000,
        random_state=42
    )

    # Full pipeline
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('model', model)])

    # Fit on the training split, score on the validation split
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict(X_val)
    acc = round(accuracy_score(y_val, y_val_pred), 3)

    val_accuracies[C] = acc

# Display results
for C, acc in val_accuracies.items():
    print(f"C={C}: Validation Accuracy={acc}")

# Best C: the smallest C among those reaching the top rounded accuracy.
# The maximum is computed once instead of once per candidate.
best_acc = max(val_accuracies.values())
best_C = min(C for C, acc in val_accuracies.items() if acc == best_acc)
print(f"\nBest C: {best_C}")


C=0.01: Validation Accuracy=0.743
C=0.1: Validation Accuracy=0.743
C=1: Validation Accuracy=0.743
C=10: Validation Accuracy=0.743
C=100: Validation Accuracy=0.743

Best C: 0.01
