In [2]:
# Exercise 1

import pandas as pd
import statsmodels.api as sm
from google.colab import files

# Step 1: Upload dataset
print("Please upload binary.dta file...")
uploaded = files.upload()

# Step 2: Load uploaded dataset
data = pd.read_stata("binary.dta")

print("Dataset head:")
print(data.head())

# Step 3: Define variables
X = data[['gre', 'gpa', 'rank']]
y = data['admit']

# Step 4: Add constant for statsmodels
X_const = sm.add_constant(X)

# Step 5: Logistic regression model
model = sm.Logit(y, X_const).fit()
print(model.summary())


Please upload binary.dta file...


Saving binary.dta to binary.dta
Dataset head:
   admit    gre   gpa  rank
0    0.0  380.0  3.61   3.0
1    1.0  660.0  3.67   3.0
2    1.0  800.0  4.00   1.0
3    1.0  640.0  3.19   4.0
4    0.0  520.0  2.93   4.0
Optimization terminated successfully.
         Current function value: 0.574302
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  admit   No. Observations:                  400
Model:                          Logit   Df Residuals:                      396
Method:                           MLE   Df Model:                            3
Date:                Wed, 10 Sep 2025   Pseudo R-squ.:                 0.08107
Time:                        05:19:11   Log-Likelihood:                -229.72
converged:                       True   LL-Null:                       -249.99
Covariance Type:            nonrobust   LLR p-value:                 8.207e-09
                 coef    std err          z      P>|

In [17]:
!pip install scikit-learn pandas openpyxl



In [18]:
# Exercise 2

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load dataset
file_path = "/content/datatab.xlsx"
data = pd.read_excel(file_path)

# Encode categorical variables
le_gender = LabelEncoder()
data["Gender"] = le_gender.fit_transform(data["Gender"])  # Fit on entire dataset
le_smoker = LabelEncoder()
data["Smoker status"] = le_smoker.fit_transform(data["Smoker status"])  # Fit on entire dataset
le_disease = LabelEncoder()
data["Disease"] = le_disease.fit_transform(data["Disease"]) # diseased=1, not diseased=0 # Fit on entire dataset


X = data[["Age", "Gender", "Smoker status"]] # features
y = data["Disease"] # target

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nAccuracy:", accuracy_score(y_test, y_pred))

# Example: Predict susceptibility for a new person
# Age=30, Female, Smoker
new_person = pd.DataFrame([[30, le_gender.transform(["Female"])[0], le_smoker.transform(["Smoker"])[0]]],
                          columns=["Age", "Gender", "Smoker status"])
prediction = model.predict(new_person)
print("\nPrediction for new person:", le_disease.inverse_transform(prediction)[0]) # Use inverse_transform to get original label

Confusion Matrix:
 [[4 2]
 [0 2]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.67      0.80         6
           1       0.50      1.00      0.67         2

    accuracy                           0.75         8
   macro avg       0.75      0.83      0.73         8
weighted avg       0.88      0.75      0.77         8


Accuracy: 0.75

Prediction for new person: not diseased


In [19]:
# Exercise 3

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset
data = pd.read_excel("/content/retail.xlsx")

# Encode categorical variables
le = LabelEncoder()
data["Purchasing behaviour"] = le.fit_transform(data["Purchasing behaviour"])  # Buy now=1, else=0
data["Gender"] = le.fit_transform(data["Gender"])  # male/female → 0/1

# Features and target
X = data[["Gender", "Age", "Time spent in online shop"]]
y = data["Purchasing behaviour"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Example prediction for a new customer (female, age=30, time=25 mins)
new_customer = pd.DataFrame([[le.transform(["female"])[0], 30, 25]],
                            columns=["Gender", "Age", "Time spent in online shop"])
print("\nNew Customer Prediction:", "Will Buy" if model.predict(new_customer)[0] == 1 else "Will Not Buy")


Accuracy: 0.4

Confusion Matrix:
 [[0 2 0]
 [0 1 0]
 [0 1 1]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.25      1.00      0.40         1
           2       1.00      0.50      0.67         2

    accuracy                           0.40         5
   macro avg       0.42      0.50      0.36         5
weighted avg       0.45      0.40      0.35         5


New Customer Prediction: Will Buy


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
