In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [14]:
# Notebook-wide settings: dataset path, label column, and the seed that the
# splitting and model-fitting cells pass as `random_state`.
INPUT_CSV = "course_lead_scoring.csv"
TARGET_COL = "converted"
RANDOM_SEED = 42

# Read the raw table and report its (rows, columns) shape.
df = pd.read_csv(INPUT_CSV)
print(f"Data shape: {df.shape}")

# Show the first record as the cell's rich output.
df.head(1)

Data shape: (1462, 9)


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1


In [13]:
# Sanity check on the label: list the distinct values of the configured target
# column (uses TARGET_COL rather than a second hard-coded copy of its name).
df[TARGET_COL].unique()

array([1, 0])

In [15]:
# Stop early when the configured label column is absent from the loaded frame.
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found in dataset. Ganti TARGET_COL jika perlu.")

# Split the frame into features and label; both are copies, so `df` is not modified.
X = df.drop(columns=[TARGET_COL]).copy()
y = df[TARGET_COL].copy()

# Group the feature columns by dtype: object/category vs. numeric.
cat_cols = list(X.select_dtypes(include=['object','category']).columns)
num_cols = list(X.select_dtypes(include=[np.number]).columns)

print("Categorical columns:", cat_cols)
print("Numerical columns:", num_cols)

# Constant imputation: the literal string 'NA' for categoricals, 0.0 for numerics.
X[num_cols] = X[num_cols].fillna(0.0)
X[cat_cols] = X[cat_cols].fillna('NA')

# Q1: most frequent value of `industry`, computed after imputation so that
# 'NA' competes as a category of its own.
if 'industry' in X.columns:
    mode_industry = X['industry'].mode()
    # .mode() may return several values on ties; the first one is reported.
    mode_val = None if mode_industry.empty else mode_industry.iloc[0]
    print("Q1: mode of industry ->", mode_val)
    print("Choose among: NA, technology, healthcare, retail")
else:
    print("Warning: 'industry' column not in data.")

Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']
Numerical columns: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
Q1: mode of industry -> retail
Choose among: NA, technology, healthcare, retail


In [16]:
# Correlation matrix over the imputed numeric features.
corr = X[num_cols].corr()

# The four candidate pairs for Q2, keyed by a label used in the printout.
pairs = {
    "interaction_count_and_lead_score": ("interaction_count","lead_score"),
    "number_of_courses_viewed_and_lead_score": ("number_of_courses_viewed","lead_score"),
    "number_of_courses_viewed_and_interaction_count": ("number_of_courses_viewed","interaction_count"),
    "annual_income_and_interaction_count": ("annual_income","interaction_count")
}

# Look up each pair's coefficient; NaN stands in for a pair whose columns are
# missing from the correlation matrix.
pair_corr = {
    label: (corr.loc[left, right] if left in corr.index and right in corr.columns else np.nan)
    for label, (left, right) in pairs.items()
}

print("\nQ2: correlations for requested pairs (only these considered):")
for k, v in pair_corr.items():
    print(f"{k}: {v}")

# Compare pairs by absolute coefficient, skipping the NaN placeholders.
valid = {k: abs(v) for k, v in pair_corr.items() if not pd.isna(v)}
if not valid:
    print("Q2: none of the pairs are present in the numeric columns.")
else:
    # On ties, max() keeps the first label in insertion order.
    best = max(valid, key=valid.get)
    # Human-readable names for the printed answer.
    mapping = {
        "interaction_count_and_lead_score":"interaction_count and lead_score",
        "number_of_courses_viewed_and_lead_score":"number_of_courses_viewed and lead_score",
        "number_of_courses_viewed_and_interaction_count":"number_of_courses_viewed and interaction_count",
        "annual_income_and_interaction_count":"annual_income and interaction_count"
    }
    print("Q2: Best correlated pair ->", mapping[best])


Q2: correlations for requested pairs (only these considered):
interaction_count_and_lead_score: 0.009888182496913105
number_of_courses_viewed_and_lead_score: -0.004878998354681265
number_of_courses_viewed_and_interaction_count: -0.023565222882888055
annual_income_and_interaction_count: 0.027036472404814396
Q2: Best correlated pair -> annual_income and interaction_count


In [17]:
# Two-stage split: hold out 20% as the test set, then take 25% of the remainder
# as validation. Stratification on the label is applied only when the label has
# more than one distinct value.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=(y if y.nunique() > 1 else None),
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=RANDOM_SEED,
    stratify=(y_temp if y_temp.nunique() > 1 else None),
)

print("\nSplit sizes: train, val, test ->", X_train.shape[0], X_val.shape[0], X_test.shape[0])

# Q3: mutual information between each categorical feature and the label,
# computed on the training split only. Each column is integer-encoded first;
# the fitted encoders are kept in `le_dict`.
mi_scores = {}
le_dict = {}
for col in cat_cols:
    vals = X_train[col].astype(str)
    le = LabelEncoder()
    le.fit(vals)
    le_dict[col] = le
    # mutual_info_classif expects a 2-D feature array, hence the reshape.
    X_train_col_enc = le.transform(vals)
    score = mutual_info_classif(
        X_train_col_enc.reshape(-1,1),
        y_train,
        discrete_features=True,
        random_state=RANDOM_SEED,
    )
    mi_scores[col] = float(score[0])

mi_scores_rounded = {name: round(value, 2) for name, value in mi_scores.items()}
print("\nQ3: mutual information scores (train set, rounded 2 decimals):")
for k, v in mi_scores_rounded.items():
    print(k, ":", v)

# Restrict the answer to the listed options; NaN marks an option without a score.
options_q3 = ['industry','location','lead_source','employment_status']
present_options = {opt: mi_scores.get(opt, np.nan) for opt in options_q3}
print("\nQ3: selected option scores (unrounded):")
for k, v in present_options.items():
    print(k, "->", v)

valid_opts = {k: v for k, v in present_options.items() if not pd.isna(v)}
if not valid_opts:
    print("Q3: none of the listed categorical columns present in data.")
else:
    # The winner is picked on unrounded scores; ties keep the first option listed.
    best_q3 = max(valid_opts, key=valid_opts.get)
    print("Q3: variable with biggest mutual info ->", best_q3)


Split sizes: train, val, test -> 876 293 293

Q3: mutual information scores (train set, rounded 2 decimals):
lead_source : 0.03
industry : 0.01
employment_status : 0.01
location : 0.0

Q3: selected option scores (unrounded):
industry -> 0.005958887591320947
location -> 0.0014310098448050215
lead_source -> 0.027718699139711492
employment_status -> 0.007695750218801714
Q3: variable with biggest mutual info -> lead_source


In [20]:
# Dense one-hot encoder. The keyword for dense output is tried as
# `sparse_output` first; if the installed scikit-learn rejects it with a
# TypeError, fall back to the `sparse` keyword.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training split only, using the same string view that
# prepare_matrix() passes to transform(). The fit is skipped when there are no
# categorical columns, matching the guard inside prepare_matrix().
if cat_cols:
    ohe.fit(X_train[cat_cols].astype(str))

def prepare_matrix(X_df):
    """Return the model design matrix for ``X_df`` as a 2-D NumPy array.

    Numeric columns (``num_cols``) are cast to float and placed first, followed
    by the one-hot encoding of the categorical columns (``cat_cols``) produced
    by the already-fitted ``ohe``. An empty column list contributes a
    zero-width block, so the row count is always ``len(X_df)``.
    """
    X_num = X_df[num_cols].astype(float).values if num_cols else np.empty((len(X_df),0))
    X_cat = ohe.transform(X_df[cat_cols].astype(str)) if cat_cols else np.empty((len(X_df),0))
    return np.hstack([X_num, X_cat])

X_train_mat = prepare_matrix(X_train)
X_val_mat   = prepare_matrix(X_val)
X_test_mat  = prepare_matrix(X_test)

# Q4: baseline logistic regression, scored on the validation split.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=RANDOM_SEED)
model.fit(X_train_mat, y_train)
y_val_pred = model.predict(X_val_mat)
acc_val = accuracy_score(y_val, y_val_pred)
acc_val_rounded = round(acc_val, 2)

print("\nQ4: validation accuracy (rounded 2 decimals) ->", acc_val_rounded)
print("Choose among: 0.64, 0.74, 0.84, 0.94")


Q4: validation accuracy (rounded 2 decimals) -> 0.73
Choose among: 0.64, 0.74, 0.84, 0.94


In [26]:
# Q5: leave-one-feature-out. For every feature, retrain the same model without
# it and compare validation accuracy against the full-feature baseline.
baseline_acc = acc_val
diffs = {}
for feat in num_cols + cat_cols:
    # Remove `feat` from whichever column list contains it.
    num_cols_reduced = [c for c in num_cols if c != feat]
    cat_cols_reduced = [c for c in cat_cols if c != feat]
    # Fresh encoder per iteration; same keyword fallback as the baseline cell.
    try:
        ohe_red = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        ohe_red = OneHotEncoder(handle_unknown='ignore', sparse=False)
    if cat_cols_reduced:
        ohe_red.fit(X_train[cat_cols_reduced].astype(str))

    def make_mat(Xdf):
        """Build the design matrix for ``Xdf`` from the current reduced column lists."""
        Xnum = Xdf[num_cols_reduced].astype(float).values if num_cols_reduced else np.empty((len(Xdf),0))
        Xcat = ohe_red.transform(Xdf[cat_cols_reduced].astype(str)) if cat_cols_reduced else np.empty((len(Xdf),0))
        return np.hstack([Xnum, Xcat])

    Xtr = make_mat(X_train)
    Xvl = make_mat(X_val)
    mdl = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=RANDOM_SEED)
    mdl.fit(Xtr, y_train)
    preds = mdl.predict(Xvl)
    acc = accuracy_score(y_val, preds)
    # Signed difference: positive means accuracy dropped without the feature,
    # negative means accuracy went up without it.
    diffs[feat] = baseline_acc - acc

choices_q5 = ['industry','employment_status','lead_score']
valid_diff = {f: diffs[f] for f in choices_q5 if f in diffs}

if valid_diff:
    # "Smallest impact" is judged by magnitude. Taking the minimum of the signed
    # values would instead favour the feature whose removal raises accuracy the
    # most, even though that is a large change. Ties keep the first listed choice.
    smallest = min(valid_diff, key=lambda f: abs(valid_diff[f]))
    print(f"\nQ5: Feature with the smallest impact (smallest accuracy difference) → {smallest}")
else:
    print("Q5: Column not found in the dataset.")


Q5: Feature with the smallest impact (smallest accuracy difference) → employment_status


In [27]:
# Q6: sweep the inverse regularization strength C and score each model on the
# validation split, reusing the matrices built for the baseline model.
Cs = [0.01, 0.1, 1, 10, 100]
acc_by_C = {}
for c in Cs:
    mdl = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=RANDOM_SEED)
    mdl.fit(X_train_mat, y_train)
    preds = mdl.predict(X_val_mat)
    acc_by_C[c] = accuracy_score(y_val, preds)

acc_rounded = {c: round(v,3) for c,v in acc_by_C.items()}
best_acc = max(acc_by_C.values())
# Detect ties on the 3-decimal values that are actually reported, not on raw
# floats, so the choice always agrees with the printed table. Among tied
# candidates the smallest C wins.
best_acc_rounded = round(best_acc, 3)
best_cs = [c for c,v in acc_rounded.items() if v == best_acc_rounded]
chosen_c = min(best_cs)

print("\nQ6: Accuracy for each C value (rounded to 3 decimals):")
for c, v in acc_rounded.items():
    print(f"C={c}: {v}")
print(f"Q6: Best C value → {chosen_c}")

print("\nAll steps completed successfully.")


Q6: Accuracy for each C value (rounded to 3 decimals):
C=0.01: 0.734
C=0.1: 0.73
C=1: 0.73
C=10: 0.73
C=100: 0.73
Q6: Best C value → 0.01

All steps completed successfully.
