In [1]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
df = pd.read_csv(url)

In [3]:
# Inspect the dataset
print("Dataset columns:", df.columns.tolist())
print("Missing values:\n", df.isna().sum())

Dataset columns: ['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income', 'employment_status', 'location', 'interaction_count', 'lead_score', 'converted']
Missing values:
 lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


In [4]:
# Handle missing values for numerical columns
#for col in df.select_dtypes(include=['float64', 'int64']).columns:
#    df[col] = df[col].fillna(0)  # Fill with 0, as in Question 6

# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object']).columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

print("\nCategorical columns:", categorical_cols.tolist())
print("Numerical columns:", numerical_cols.tolist())

# Replace missing values
# Categorical: fill with 'NA'
#for col in categorical_cols:
#    df[col] = df[col].fillna('NA')

# Numerical: fill with 0.0
#for col in numerical_cols:
#    df[col] = df[col].fillna(0.0)


# Data preparation: Fill missing values
# Categorical: fill with 'NA'
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].fillna('NA')

# Numerical: fill with 0.0
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_cols:
    df[col] = df[col].fillna(0.0)

# Verify missing values after preprocessing
print("\nMissing values after preprocessing:\n", df.isna().sum())



Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']
Numerical columns: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']

Missing values after preprocessing:
 lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [5]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [6]:
# Inspect the dataset
print("Dataset columns:", df.columns.tolist())
print("\nMissing values in 'industry':", df['industry'].isna().sum())
print("\nValue counts for 'industry':\n", df['industry'].value_counts(dropna=False))

Dataset columns: ['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income', 'employment_status', 'location', 'interaction_count', 'lead_score', 'converted']

Missing values in 'industry': 0

Value counts for 'industry':
 industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64


In [7]:
# Compute the mode of the 'industry' column (raw data)
industry_mode = df['industry'].mode()[0]
print("\nMode of 'industry' (raw data):", industry_mode)


Mode of 'industry' (raw data): retail


In [8]:
# Fill missing values in 'industry' with 'NA' (per data preparation)
df['industry_filled'] = df['industry'].fillna('NA')

# Compute the mode after filling missing values
industry_mode_filled = df['industry_filled'].mode()[0]
print("Mode of 'industry' (after filling with 'NA'):", industry_mode_filled)

Mode of 'industry' (after filling with 'NA'): retail


In [9]:
# Define the pairs to check
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]
# Compute Pearson correlations
correlations = {}
for col1, col2 in pairs:
    corr = df[col1].corr(df[col2])
    abs_corr = abs(corr)
    correlations[f"{col1} and {col2}"] = (corr, abs_corr)
    print(f"Correlation between {col1} and {col2}: {corr:.3f} (Absolute: {abs_corr:.3f})")

Correlation between interaction_count and lead_score: 0.010 (Absolute: 0.010)
Correlation between number_of_courses_viewed and lead_score: -0.005 (Absolute: 0.005)
Correlation between number_of_courses_viewed and interaction_count: -0.024 (Absolute: 0.024)
Correlation between annual_income and interaction_count: 0.027 (Absolute: 0.027)


In [10]:
# Find the pair with the highest absolute correlation
max_pair = max(correlations, key=lambda x: correlations[x][1])
max_abs_corr = correlations[max_pair][1]

print(f"\nPair with highest absolute correlation: {max_pair}")
print(f"Absolute correlation value: {max_abs_corr:.3f}")
print(f"Answer: {max_pair}")


Pair with highest absolute correlation: annual_income and interaction_count
Absolute correlation value: 0.027
Answer: annual_income and interaction_count


In [11]:
# Verify required columns
required_cols = ['industry', 'location', 'lead_source', 'employment_status']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    print(f"Error: Missing columns {missing_cols}")
    exit()

In [12]:
# Assume target is 'converted' (binary classification)
target = 'converted'
if target not in df.columns:
    print(f"Error: Target '{target}' not found. Available columns:", df.columns.tolist())
    exit()

In [13]:
# Encode categorical features for MI calculation
X = df[required_cols].copy()
y = df[target]

In [14]:
# Label encode categorical features
label_encoders = {}
for col in required_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

In [15]:
# Compute mutual information scores
mi_scores = mutual_info_classif(X, y, random_state=42)
mi_dict = {col: score for col, score in zip(required_cols, mi_scores)}


In [16]:
# Print MI scores
for col, score in mi_dict.items():
    print(f"MI score for {col}: {score:.3f}")

# Find the feature with the highest MI score
max_feature = max(mi_dict, key=mi_dict.get)
max_mi = mi_dict[max_feature]

print(f"\nFeature with highest MI: {max_feature}")
print(f"Highest MI score: {max_mi:.3f}")
print(f"Answer: {max_feature}")

MI score for industry: 0.015
MI score for location: 0.000
MI score for lead_source: 0.037
MI score for employment_status: 0.033

Feature with highest MI: lead_source
Highest MI score: 0.037
Answer: lead_source


In [17]:
# One-hot encode categorical variables
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [18]:
# Assume target is 'converted' (binary classification)
target = 'converted'
if target not in df_encoded.columns:
    print(f"Error: Target '{target}' not found. Available columns:", df_encoded.columns.tolist())
    exit()

In [19]:
# Define features and target
X = df_encoded.drop(target, axis=1)
y = df_encoded[target]

In [20]:
# Split the data: 60% train, 20% validation, 20% test with seed 42
X_full, X_test, y_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_full, y_full, test_size=0.25, random_state=42)

In [21]:
# Debug: Check shapes
#print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}, Test shape: {X_test.shape}")


# Debug: Check for non-numerical columns and missing values
print(f"\nX_train dtypes:\n", X_train.dtypes)
print(f"X_train missing values:\n", X_train.isna().sum())



X_train dtypes:
 number_of_courses_viewed             int64
annual_income                      float64
interaction_count                    int64
lead_score                         float64
industry_filled                     object
lead_source_events                    bool
lead_source_organic_search            bool
lead_source_paid_ads                  bool
lead_source_referral                  bool
lead_source_social_media              bool
industry_education                    bool
industry_finance                      bool
industry_healthcare                   bool
industry_manufacturing                bool
industry_other                        bool
industry_retail                       bool
industry_technology                   bool
employment_status_employed            bool
employment_status_self_employed       bool
employment_status_student             bool
employment_status_unemployed          bool
location_africa                       bool
location_asia                       

In [22]:
# Scale numerical features
#X_train_processed = pd.get_dummies(X_train, columns=['customer_type'], drop_first=True)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

ValueError: could not convert string to float: 'retail'

In [None]:
# Debug: Check shapes
print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}, Test shape: {X_test.shape}")

# Train logistic regression model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)


In [None]:
# Predict on validation set
y_pred = model.predict(X_val_scaled)

In [None]:
# Compute accuracy on validation set
baseline_accuracy = accuracy_score(y_val, y_pred)
accuracy_rounded = round(baseline_accuracy, 2)
print(f"Accuracy on validation set: {accuracy_rounded}")
print(f"Answer: {accuracy_rounded}")

In [None]:
# Features to eliminate
features_to_test = ['industry', 'employment_status', 'lead_score']

# Store accuracy differences
accuracy_differences = {}

# Feature elimination
for feature in features_to_test:
    # Identify columns to drop
    if feature in ['industry', 'employment_status']:
        # Drop all one-hot encoded columns for the categorical feature
        drop_cols = [col for col in X.columns if col.startswith(f"{feature}_")]
    else:
        # Drop the single numerical column
        drop_cols = [feature]
    
    if not drop_cols:
        print(f"Warning: No columns found for {feature}")
        continue
    
    # Create new feature set
    X_train_temp = X_train.drop(columns=drop_cols)
    X_val_temp = X_val.drop(columns=drop_cols)
    
    # Scale temporary features
    scaler_temp = StandardScaler()
    X_train_temp_scaled = scaler_temp.fit_transform(X_train_temp)
    X_val_temp_scaled = scaler_temp.transform(X_val_temp)
    
    # Train model without the feature
    model_temp = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_temp.fit(X_train_temp_scaled, y_train)
    y_pred_temp = model_temp.predict(X_val_temp_scaled)
    accuracy_temp = accuracy_score(y_val, y_pred_temp)
    
    # Calculate difference
    difference = baseline_accuracy - accuracy_temp
    accuracy_differences[feature] = difference
    print(f"Accuracy without {feature}: {accuracy_temp:.6f}, Difference: {difference:.6f}")

# Find feature with smallest absolute difference
min_feature = min(accuracy_differences, key=lambda x: abs(accuracy_differences[x]))
min_difference = accuracy_differences[min_feature]

print(f"\nFeature with smallest absolute difference: {min_feature}")
print(f"Smallest absolute difference: {abs(min_difference):.6f}")
print(f"Answer: {min_feature}")

In [None]:
# Define C values to test
C_values = [0.01, 0.1, 1, 10, 100]

# Store accuracies
accuracies = {}

In [None]:
# Train models for each C value
for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_scaled)
    accuracy = accuracy_score(y_val, y_pred)
    accuracy_rounded = round(accuracy, 3)
    accuracies[C] = accuracy_rounded
    print(f"Accuracy for C={C}: {accuracy_rounded}")

# Find the best C (smallest C if tied)
best_C = min((C for C, acc in accuracies.items() if acc == max(accuracies.values())), key=lambda x: x)
best_accuracy = accuracies[best_C]

print(f"\nBest C: {best_C}")
print(f"Best accuracy: {best_accuracy}")
print(f"Answer: {best_C}")