# Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc

In [3]:
# Read the streaming-service CSV from the Kaggle input directory.
csv_path = "/kaggle/input/streaming-service-data/Streaming.csv"
data = pd.read_csv(csv_path)

# Print a summary of the columns (names, dtypes, non-null counts).
data.info()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/akash/Downloads/ML Project/Streaming.csv'

In [None]:
# Preview the first five rows (five is head()'s default).
data.head(n=5)

# Data Cleaning

In [None]:
# Step 3: Data Cleaning
# Impute missing values with each column's median.
# The result is assigned back to the column instead of calling
# `data[col].fillna(..., inplace=True)`: that form is a chained in-place
# update, which emits a FutureWarning in pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is enabled.
for col in ['Age', 'Satisfaction_Score']:
    data[col] = data[col].fillna(data[col].median())

# Fix outliers in 'Monthly_Spend': values outside
# [valid_range_min, valid_range_max] are replaced with the median of the
# in-range values.
valid_range_min = 15
valid_range_max = 70
in_valid_range = data['Monthly_Spend'].between(valid_range_min, valid_range_max)
valid_median = data.loc[in_valid_range, 'Monthly_Spend'].median()

# Built from explicit comparisons (not `~in_valid_range`) so that missing
# Monthly_Spend values are left untouched, as before.
out_of_range = (data['Monthly_Spend'] < valid_range_min) | (data['Monthly_Spend'] > valid_range_max)
data.loc[out_of_range, 'Monthly_Spend'] = valid_median

# Drop irrelevant columns.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
data = data.drop(columns=['Customer_ID'], errors='ignore')

# Data Validation Checks

In [None]:
# Null counts per column, then the same expressed as a percentage of rows.
null_mask = data.isnull()
print("Missing Values Summary:")
print(null_mask.sum())

missing_percent = null_mask.mean() * 100
print("\nPercentage of Missing Values:")
print(missing_percent)

# Rows that are exact duplicates of an earlier row.
print("\nNumber of duplicate rows:", data.duplicated().sum())

# Column dtypes.
print("\nData Types:")
print(data.dtypes)

# Distinct values present in each categorical column.
categorical_cols = ['Gender', 'Region', 'Payment_Method']
for col in categorical_cols:
    distinct_values = data[col].unique()
    print(f"\nUnique values in {col}: {distinct_values}")

# describe() gives count/mean/std/min/quartiles/max, useful for spotting
# implausible extremes.
print("\nStatistical Summary of Numeric Columns:")
print(data.describe())

In [None]:
# Replace each categorical column with integer codes. One fitted encoder per
# column is kept in `label_encoders` so the codes can be mapped back later.
categorical_cols = ['Gender', 'Region', 'Payment_Method']
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    data[col] = le.fit_transform(data[col])

# Exploratory Data Analysis (EDA)

In [None]:
# Step 4: Exploratory Data Analysis (EDA)
# Annotated heatmap of the pairwise correlations between the columns of `data`.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Histogram of Monthly_Spend with a kernel-density overlay.
spend_ax = sns.histplot(data['Monthly_Spend'], kde=True)
spend_ax.set_title('Monthly Spend Distribution After Outlier Handling')
plt.show()

In [None]:
# Bar chart of the number of rows in each Churned class.
ax = sns.countplot(data=data, x='Churned')
ax.set_title('Churned Count')

# Write each bar's count just above it, centred on the bar.
for patch in ax.patches:
    bar_height = patch.get_height()
    x_center = patch.get_x() + patch.get_width() / 2
    ax.annotate(f'{int(bar_height)}', (x_center, bar_height),
                ha='center', va='bottom', fontsize=10)
plt.show()

# Feature Selection

In [None]:
# Step 5: Feature Selection using RFE
# Separate features and target variables for regression and classification
X = data.drop(columns=['Monthly_Spend', 'Churned'])
y_reg = data['Monthly_Spend']
y_class = data['Churned']

# Fit the scaler and the selectors on training rows only. Fitting them on
# every row would let the held-out test rows influence which features are
# chosen (data leakage). The Train-Test Split step uses the same test_size
# and random_state on the same number of rows, so it holds out exactly the
# rows excluded here.
X_fs_train, _, y_reg_fs_train, _, y_class_fs_train, _ = train_test_split(
    X, y_reg, y_class, test_size=0.3, random_state=42
)

# Scale features using statistics learned from the training rows
scaler = StandardScaler()
X_fs_train_scaled = scaler.fit_transform(X_fs_train)
# Full feature matrix scaled with the training statistics
X_scaled = scaler.transform(X)

# Recursive Feature Elimination (RFE) for regression: keep 4 features
lr = LinearRegression()
rfe_reg = RFE(lr, n_features_to_select=4)
rfe_reg.fit(X_fs_train_scaled, y_reg_fs_train)
selected_features_reg = X.columns[rfe_reg.support_]

# Recursive Feature Elimination (RFE) for classification: keep 5 features
clf = LogisticRegression()
rfe_class = RFE(clf, n_features_to_select=5)
rfe_class.fit(X_fs_train_scaled, y_class_fs_train)
selected_features_class = X.columns[rfe_class.support_]

print("Selected features for regression:", selected_features_reg)
print("Selected features for classification:", selected_features_class)

# Train-Test Split

In [None]:
# Step 6: Train-Test Split
# Restrict each task to the columns chosen by its RFE selector.
X_reg = data[selected_features_reg]
X_class = data[selected_features_class]

# Both tasks use a 70/30 split with the same seed.
split_options = {'test_size': 0.3, 'random_state': 42}
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, **split_options
)
X_class_train, X_class_test, y_class_train, y_class_test = train_test_split(
    X_class, y_class, **split_options
)

# Regression

In [None]:
# Step 7: Train Models and Evaluate
# Regression: fit a linear model and a random forest on the regression split.
# fit() returns the estimator itself, so construction and fitting are chained.
lr_model = LinearRegression().fit(X_reg_train, y_reg_train)
rf_reg_model = RandomForestRegressor(
    n_estimators=50, max_depth=10, random_state=42
).fit(X_reg_train, y_reg_train)

# Predictions on the held-out rows
y_reg_pred_lr = lr_model.predict(X_reg_test)
y_reg_pred_rf = rf_reg_model.predict(X_reg_test)

# R^2 of each model on the held-out rows
lr_r2 = r2_score(y_true=y_reg_test, y_pred=y_reg_pred_lr)
rf_r2 = r2_score(y_true=y_reg_test, y_pred=y_reg_pred_rf)

print("Linear Regression R^2:", lr_r2)
print("Random Forest Regressor R^2:", rf_r2)

# Comparing Regression Metrics for Linear Regression and Random Forest Regressor

In [None]:
# Compare the R^2 scores of the two regression models.
# `lr_r2` and `rf_r2` come from the regression step above. The models are not
# refitted here: both use fixed settings and a fixed random_state, so
# refitting would only repeat the same work and reproduce the same scores.
r2_scores = {'Linear Regression': lr_r2, 'Random Forest': rf_r2}

print("Linear Regression R^2:", lr_r2)
print("Random Forest Regressor R^2:", rf_r2)

# Visualize the Metrics
plt.figure(figsize=(8, 6))
plt.bar(list(r2_scores.keys()), list(r2_scores.values()))
plt.xlabel('Regression Model')
plt.ylabel('R^2 Score')
plt.title('Comparison of R^2 Scores')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Print the R^2 values next to the end of each bar. R^2 can be negative, in
# which case the bar hangs below zero and the label must go underneath it
# rather than 0.02 above the (negative) value, where it would overlap the bar.
for i, v in enumerate(r2_scores.values()):
    if v >= 0:
        plt.text(i, v + 0.02, str(round(v, 2)), ha='center')
    else:
        plt.text(i, v - 0.02, str(round(v, 2)), ha='center', va='top')

plt.tight_layout()
plt.show()

# Classification

In [None]:
# Classification: Logistic Regression, Random Forest, and k-NN (k=5, k=9)
log_model = LogisticRegression()
rf_class_model = RandomForestClassifier(random_state=42)
log_model.fit(X_class_train, y_class_train)
rf_class_model.fit(X_class_train, y_class_train)

# k-NN classifiers with different k values
knn_class_model_5 = KNeighborsClassifier(n_neighbors=5)
knn_class_model_5.fit(X_class_train, y_class_train)
knn_class_model_9 = KNeighborsClassifier(n_neighbors=9)
knn_class_model_9.fit(X_class_train, y_class_train)

# Predictions on the held-out rows
y_class_pred_log = log_model.predict(X_class_test)
y_class_pred_rf = rf_class_model.predict(X_class_test)
y_class_pred_knn_5 = knn_class_model_5.predict(X_class_test)
y_class_pred_knn_9 = knn_class_model_9.predict(X_class_test)

# Accuracy of each model. Each line must score that model's own predictions:
# the k-NN lines previously scored the Random Forest predictions, so all
# three reported the same number.
print("Logistic Regression Accuracy:", accuracy_score(y_class_test, y_class_pred_log))
print("Random Forest Classifier Accuracy:", accuracy_score(y_class_test, y_class_pred_rf))
print("k-NN (k=5) Classifier Accuracy:", accuracy_score(y_class_test, y_class_pred_knn_5))
print("k-NN (k=9) Classifier Accuracy:", accuracy_score(y_class_test, y_class_pred_knn_9))

# Classification reports
print("Logistic Regression Classification Report:\n", classification_report(y_class_test, y_class_pred_log))
print("Random Forest Classification Report:\n", classification_report(y_class_test, y_class_pred_rf))
print("k-NN (k=5) Classification Report:\n",classification_report(y_class_test, y_class_pred_knn_5))
print("k-NN (k=9) Classification Report:\n",classification_report(y_class_test, y_class_pred_knn_9))

In [None]:
# Confusion matrices
# Custom blue -> white -> pink colormap for the two k-NN plots.
# LinearSegmentedColormap comes from matplotlib.colors.
pink_blue_cmap = LinearSegmentedColormap.from_list("pink_blue", ["#6495ed", "#ffffff", "#ff1493"])

# (plot title, predictions to score, colormap) for each classifier
confusion_plots = [
    ('Logistic Regression Confusion Matrix', y_class_pred_log, 'Blues'),
    ('Random Forest Confusion Matrix', y_class_pred_rf, 'Greens'),
    ('k-NN Classification (k=5) Confusion Matrix ', y_class_pred_knn_5, pink_blue_cmap),
    ('k-NN Classification (k=9) Confusion Matrix ', y_class_pred_knn_9, pink_blue_cmap),
]

for plot_title, y_pred, heat_cmap in confusion_plots:
    sns.heatmap(confusion_matrix(y_class_test, y_pred), annot=True, fmt='d', cmap=heat_cmap)
    plt.title(plot_title)
    # confusion_matrix puts true labels on rows and predicted labels on columns
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.show()

In [None]:
 # Binarize the target variable 
y_class_test_bin = label_binarize(y_class_test, classes=[0, 1])
# NOTE(review): classes=[0, 1] assumes Churned is coded as 0/1 — confirm against the data.

# Models to compare. All four were already fitted in the classification step,
# so they are reused here instead of being fitted a second time.
models = {
    "Logistic Regression": log_model,
    "Random Forest": rf_class_model,
    "k-NN (k=5)": knn_class_model_5,
    "k-NN (k=9)": knn_class_model_9
}

# Plotting ROC curve for each model
plt.figure(figsize=(10, 8))
for model_name, model in models.items():
    # Column 1 of predict_proba is the probability of the second class in
    # model.classes_.
    y_prob = model.predict_proba(X_class_test)[:, 1]

    fpr, tpr, _ = roc_curve(y_class_test_bin, y_prob)
    roc_auc = auc(fpr, tpr)

    # Plot the ROC curve for the current model
    plt.plot(fpr, tpr, lw=2, label=f'{model_name} (AUC = {roc_auc:.2f})')

# Plot the diagonal line (a classifier that guesses at random)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()