In [1]:
import pandas as pd

In [2]:
# Load the lead-scoring CSV straight from its raw GitHub URL into a DataFrame.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
# Column dtypes and non-null counts; columns with fewer non-null entries than rows need imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [5]:
# Number of missing values per column before imputation.
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [6]:
# Split column names by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical.
catcols = df.select_dtypes(include="object").columns.tolist()
numcols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()

In [7]:
# Show both column lists. Note that `numcols` also contains the target column
# `converted`, because it is selected purely by dtype.
catcols, numcols

(['lead_source', 'industry', 'employment_status', 'location'],
 ['number_of_courses_viewed',
  'annual_income',
  'interaction_count',
  'lead_score',
  'converted'])

In [8]:
# Impute missing values in place: categorical gaps become the literal
# string 'NA', numerical gaps become 0.0.
df[catcols] = df[catcols].fillna('NA')
df[numcols] = df[numcols].fillna(0.0)


In [9]:
# Re-check missing values per column after imputation; every count should be 0.
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [10]:
# Most frequent value (mode) of the `industry` column.
# The counts include the 'NA' placeholder that replaced missing values.

df["industry"].value_counts()

# `retail` has the highest count, so it is the mode.

retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: industry, dtype: int64

In [11]:
# Inspect the frame after imputation (the rendered output is truncated with an ellipsis row).
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [12]:
# Pearson correlation between interaction_count and lead_score.
df['interaction_count'].corr(df['lead_score'], method="pearson")


0.009888182496913077

In [13]:
# Pearson correlation between number_of_courses_viewed and lead_score.
df['number_of_courses_viewed'].corr(df['lead_score'], method="pearson")


-0.00487899835468127

In [14]:
# Pearson correlation between number_of_courses_viewed and interaction_count.
df['number_of_courses_viewed'].corr(df['interaction_count'], method="pearson")

-0.023565222882888103

In [15]:
# Pearson correlation between annual_income and interaction_count.
df['annual_income'].corr(df['interaction_count'], method="pearson")

0.027036472404814337

## Train / validation / test split

In [16]:
from sklearn.model_selection import train_test_split

# Separate the target column `converted` from the feature matrix.
y = df["converted"]
X = df.drop(columns=["converted"])

In [17]:
# Two-stage split with a fixed seed: first hold out 40% of the rows, then
# halve that hold-out into validation and test sets (60/20/20 overall).
# The intermediate hold-out gets its own names so that X_test / y_test only
# ever refer to the final test set.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [18]:
# Report how many rows each split received, as a share of all rows.
total = sum(len(part) for part in (X_train, X_valid, X_test))

print(f"\nTotal samples: {total}")
print(f"Train %: {len(X_train) / total:.2%}")
print(f"Valid %: {len(X_valid) / total:.2%}")
print(f"Test %: {len(X_test) / total:.2%}")



Total samples: 1462
Train %: 59.99%
Valid %: 19.97%
Test %: 20.04%


In [19]:
from sklearn.metrics import mutual_info_score
# Mutual information between the target and each categorical feature,
# computed on the training split only and rounded to 3 decimals.
# Order: lead_source, industry, employment_status, location.
tuple(
    round(mutual_info_score(y_train, X_train[feature]), 3)
    for feature in ('lead_source', 'industry', 'employment_status', 'location')
)



(0.028, 0.015, 0.018, 0.003)

In [20]:
# one hot encoding categorical cols 
from sklearn. preprocessing import OneHotEncoder

# One-hot encode each split, then align validation and test to the training
# columns. Encoding the splits independently yields different column sets
# whenever a category is absent from one split, which would break (or silently
# misalign) predictions from a model fitted on the training columns.
# Dummy columns missing from a split are added and filled with 0; columns not
# seen in training are dropped.
X_traine = pd.get_dummies(X_train, columns=catcols)
X_teste = pd.get_dummies(X_test, columns=catcols).reindex(columns=X_traine.columns, fill_value=0)
X_valide = pd.get_dummies(X_valid, columns=catcols).reindex(columns=X_traine.columns, fill_value=0)



In [21]:
# Preview the first five rows of the encoded validation features.
X_valide.head(n=5)


Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,lead_source_NA,lead_source_events,lead_source_organic_search,lead_source_paid_ads,lead_source_referral,lead_source_social_media,...,employment_status_student,employment_status_unemployed,location_NA,location_africa,location_asia,location_australia,location_europe,location_middle_east,location_north_america,location_south_america
886,1,63127.0,6,0.7,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
619,6,75389.0,2,0.04,0,0,0,1,0,0,...,1,0,1,0,0,0,0,0,0,0
548,2,66519.0,4,0.33,0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
1046,3,60910.0,3,0.32,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
618,0,63425.0,2,0.4,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,0,0


In [22]:
# Distinct target labels present in the training split.
pd.unique(y_train)


array([1, 0])

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: logistic regression on all one-hot-encoded features,
# evaluated by accuracy on the validation split.
model = LogisticRegression(
    solver='liblinear', C=1.0, max_iter=1000, random_state=42
).fit(X_traine, y_train)

y_pred = model.predict(X_valide)
base_acc = accuracy_score(y_true=y_valid, y_pred=y_pred)

print(round(base_acc, 2))

0.74


In [24]:
# Leave-one-feature-out check: retrain the same model without each original
# feature in turn and compare validation accuracy with the baseline (base_acc).
# A positive diff means accuracy dropped when the feature was removed.
acc_list = []
diff_list = []
cols = X_train.columns

for col in cols:
    X_train_new = X_train.drop(columns=[col])
    X_valid_new = X_valid.drop(columns=[col])

    # Loop-local name: rebinding the notebook-level `catcols` here would leak
    # state into any cell that is re-run afterwards.
    remaining_catcols = list(X_train_new.select_dtypes("object").columns)

    X_train_new = pd.get_dummies(X_train_new, columns=remaining_catcols)
    # Align validation to the training columns so both matrices share the same
    # features in the same order even if a category is missing from one split.
    # The test split is deliberately not touched: it is not needed here.
    X_valid_new = pd.get_dummies(X_valid_new, columns=remaining_catcols).reindex(
        columns=X_train_new.columns, fill_value=0
    )

    # same logistic regression model
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_new, y_train)

    y_pred = model.predict(X_valid_new)
    acc = accuracy_score(y_valid, y_pred)

    # store accuracy and diff
    acc_list.append(acc)
    diff_list.append(base_acc - acc)

results = pd.DataFrame({
    "feature": cols,
    "accuracy": acc_list,
    "diff_from_base": diff_list
}).sort_values(by="diff_from_base", ascending=False)

print("Baseline accuracy:", base_acc)
print("\nTop features affecting accuracy:\n", results.head(15))
print("\nLeast useful feature:\n", results.tail(5))


Baseline accuracy: 0.7431506849315068

Top features affecting accuracy:
                     feature  accuracy  diff_from_base
6         interaction_count  0.674658        0.068493
2  number_of_courses_viewed  0.678082        0.065068
0               lead_source  0.729452        0.013699
1                  industry  0.743151        0.000000
5                  location  0.743151        0.000000
7                lead_score  0.743151        0.000000
4         employment_status  0.746575       -0.003425
3             annual_income  0.856164       -0.113014

Least useful feature:
              feature  accuracy  diff_from_base
1           industry  0.743151        0.000000
5           location  0.743151        0.000000
7         lead_score  0.743151        0.000000
4  employment_status  0.746575       -0.003425
3      annual_income  0.856164       -0.113014


In [25]:
# Sweep the regularisation parameter C and record validation accuracy per value.
# NOTE(review): the recorded accuracy is identical for every C, and removing
# annual_income raised accuracy in the feature-elimination output — worth
# checking whether the unscaled annual_income column dominates the fit.
Cvals = [0.01, 0.1, 1, 10, 100]
results = {}

for val in Cvals:
    model = LogisticRegression(
        solver='liblinear', C=val, max_iter=1000, random_state=42
    ).fit(X_traine, y_train)
    y_pred = model.predict(X_valide)
    acc = accuracy_score(y_true=y_valid, y_pred=y_pred)
    results[val] = acc

for key, value in results.items():
    print(f"C: {key}  accuracy: {value}")




C: 0.01  accuracy: 0.7431506849315068
C: 0.1  accuracy: 0.7431506849315068
C: 1  accuracy: 0.7431506849315068
C: 10  accuracy: 0.7431506849315068
C: 100  accuracy: 0.7431506849315068
