<a href="https://colab.research.google.com/github/bigz4/COMP-9200-Assignment3/blob/main/COMP_9200_Assignment3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Part 1 — Dataset Setup and Description

In [None]:
# PART 1 — Dataset Setup and Description
# Using GitHub Raw URL (public dataset link)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# --- GitHub Raw Dataset URL ---
# Public CSV in the assignment repository; reading it requires network access.
dataset_url = "https://raw.githubusercontent.com/bigz4/COMP-9200-Assignment3/refs/heads/main/user_behavior_dataset.csv"

# --- Load dataset directly from GitHub ---
# `df` is the working frame that the following cells clean and analyse.
df = pd.read_csv(dataset_url)

print("Dataset Loaded Successfully")
print("Shape:", df.shape)
display(df.head())

# --- Dataset Overview ---
print("\nDataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line to the overview.
df.info()
print("\nMissing Values per Column:")
print(df.isnull().sum())


Part 2 — Preprocessing and Data Splitting

In [None]:

# Handle missing values: mean for numeric, mode for categorical
# NOTE: this cell overwrites `df`, so it is not idempotent -- re-running it
# filters and rescales the already processed frame. Re-run the loading cell
# first if this cell has to be executed again.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in df.columns:
    # Assign the filled column back rather than calling
    # df[col].fillna(..., inplace=True): that chained in-place form operates on
    # a temporary object under pandas Copy-on-Write and leaves `df` untouched
    # (and the deprecation warning is hidden because warnings are silenced).
    if col in numeric_cols:
        df[col] = df[col].fillna(df[col].mean())
    else:
        modes = df[col].mode()
        # mode() is empty when every value in the column is missing; there is
        # nothing sensible to fill with, so leave such a column as it is.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

# Detect and remove outliers (IQR method)
# A row is dropped when ANY numeric column falls outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for that column.
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outside_fences = (df[numeric_cols] < lower_fence) | (df[numeric_cols] > upper_fence)
filtered_df = df[~outside_fences.any(axis=1)]
# The count below is the number of removed rows.
print(f"Removed {len(df) - len(filtered_df)} outliers")
df = filtered_df.copy()

# Normalize numeric features (0–1 range)
# NOTE(review): the scaler is fitted on all rows before the split below, so the
# held-out rows influence the scaling. Fit it on the training rows only if
# these splits are ever used to train or evaluate a model.
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Simulated train/test split for reproducibility validation
# A fixed random_state keeps the 80/20 partition identical between runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print("Train/Test Split Sizes:", train_df.shape, test_df.shape)


Part 3 — Human Analytical Pipeline (Statistical Analysis)

In [None]:

# Select numeric columns only
numeric_cols = df.select_dtypes(include=['int64','float64']).columns

# Correlation analysis
# DataFrame.corr() defaults to pairwise Pearson correlation.
corr = df[numeric_cols].corr()
print("\nCorrelation Matrix (numeric columns only):")
display(corr)

# Visualization
# Explicit Figure/Axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='Blues', ax=ax)
ax.set_title("Correlation Heatmap - User Behavior Metrics")
plt.show()

# Key relationships
for pair in [('app_usage','battery_drain'),
             ('screen_on_time','app_usage'),
             ('age','app_usage')]:
    absent = [name for name in pair if name not in numeric_cols]
    if absent:
        # Report the skip instead of staying silent: if the dataset headers do
        # not match these names, this section would otherwise print nothing
        # and give no hint as to why.
        print(f"Skipping {pair[0]} vs {pair[1]}: column(s) not found among numeric columns: {absent}")
        continue
    print(f"{pair[0]} vs {pair[1]}: {corr.loc[pair[0],pair[1]]:.3f}")
# Hypothesis testing
def correlation_test(x, y, data=None):
    """Run a Pearson correlation test between two columns and format the result.

    Parameters
    ----------
    x, y : str
        Names of the two columns to correlate.
    data : pandas.DataFrame, optional
        Frame holding both columns. When omitted, the notebook-level ``df`` is
        used, which is what existing two-argument calls rely on.

    Returns
    -------
    str
        ``"<x> vs <y>: r=<r>, p=<p>"`` with r rounded to 3 decimals and the
        p-value to 5 decimals (so very small p-values display as 0.00000).
    """
    # Resolve the frame at call time so the function no longer depends solely
    # on hidden global state and can be applied to any frame.
    frame = df if data is None else data
    r, p = stats.pearsonr(frame[x], frame[y])
    return f"{x} vs {y}: r={r:.3f}, p={p:.5f}"

print("\nStatistical Significance Tests:")
tested_pairs = (
    ('app_usage', 'battery_drain'),
    ('screen_on_time', 'app_usage'),
    ('age', 'app_usage'),
)
for left, right in tested_pairs:
    # Only pairs whose two columns are both numeric columns are tested.
    if left not in numeric_cols or right not in numeric_cols:
        continue
    print(correlation_test(left, right))

# Optional: summarize categorical columns
# "Categorical" here means every column that is not int64/float64.
print("\nCategorical Columns Summary:")
non_numeric = df.select_dtypes(exclude=['int64','float64'])
for name in non_numeric.columns:
    distinct = df[name].nunique()
    print(f"{name}: {distinct} unique values")


Part 4 — AI-Assisted Component, Bias, and Ethics

In [None]:

# NOTE(review): these statements are fixed text; nothing in this cell derives
# them from `df`, so check them against the computed correlations above.
ai_summary = [
    "AI identified strong positive correlation between app usage and battery drain.",
    "Moderate negative correlation between age and app usage.",
    "AI noted potential OS-based bias between Android and iOS devices.",
    "AI analysis was efficient but required human validation."
]
print("\nAI-Assisted Insights:")
for line in ai_summary:
    print("-", line)

# Bias check example
# Compares the mean of "battery_drain" across the groups of the "os" column.
absent_bias_cols = [name for name in ("os", "battery_drain") if name not in df.columns]
if not absent_bias_cols:
    print("\nBias Check - Average Battery Drain by OS:")
    display(df.groupby("os")["battery_drain"].mean())
else:
    # Make the skip visible; otherwise the section reads as if the check had
    # been carried out even though nothing was computed.
    print(f"\nBias Check skipped - column(s) not found: {absent_bias_cols}")

print("\nEthical Handling and Anonymization:")
print("Dataset contains no identifiable personal information and follows research ethics guidelines.")


Part 5 — Reproducibility, Bootstrap Stability, and Visualization

In [None]:

import platform
import scipy
import sklearn

# Record the interpreter and library versions this run used, so the results
# can be matched to an environment later.
print("Reproducibility Environment:")
version_report = [
    ("Python:", platform.python_version()),
    ("pandas:", pd.__version__),
    ("numpy:", np.__version__),
    ("scipy:", scipy.__version__),
    ("scikit-learn:", sklearn.__version__),
]
for label, version in version_report:
    print(label, version)

# Bootstrap correlation stability
def bootstrap_corr(data, col1, col2, n_iter=1000, random_state=42):
    """Bootstrap the Pearson correlation between two columns.

    Rows are resampled with replacement (same size as the input) ``n_iter``
    times and Pearson's r is computed on each resample.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns.
    col1, col2 : str
        Names of the columns to correlate.
    n_iter : int, default 1000
        Number of bootstrap resamples.
    random_state : int, numpy.random.Generator or None, default 42
        Seed (or generator) for the resampling. The fixed default makes
        repeated calls return identical results; pass ``None`` for fresh
        randomness on every call.

    Returns
    -------
    tuple
        ``(mean_r, ci)`` where ``mean_r`` is the mean of the bootstrap
        correlations and ``ci`` is an array holding their 2.5th and 97.5th
        percentiles (a 95% percentile interval).

    Raises
    ------
    ValueError
        If fewer than two complete rows are available.
    """
    # Only the two columns of interest are needed; pulling them out once as a
    # float array avoids copying the whole frame on every iteration. Rows with
    # a missing value in either column are dropped, since they would turn the
    # correlation into NaN.
    values = data[[col1, col2]].dropna().to_numpy(dtype=float)
    n_rows = len(values)
    if n_rows < 2:
        raise ValueError("bootstrap_corr needs at least two complete rows")

    rng = np.random.default_rng(random_state)
    rs = []
    for _ in range(n_iter):
        # Draw row positions with replacement: one bootstrap resample.
        picked = rng.integers(0, n_rows, size=n_rows)
        r, _ = stats.pearsonr(values[picked, 0], values[picked, 1])
        rs.append(r)
    return np.mean(rs), np.percentile(rs, [2.5, 97.5])

print("\nBootstrap Stability (95% Confidence Intervals):")
stability_pairs = (
    ('app_usage', 'battery_drain'),
    ('screen_on_time', 'app_usage'),
    ('age', 'app_usage'),
)
for first, second in stability_pairs:
    # Pairs with a column missing from the numeric columns are left out.
    if first not in numeric_cols or second not in numeric_cols:
        continue
    mean_r, ci = bootstrap_corr(df, first, second)
    print(f"{first} vs {second}: mean r={mean_r:.3f}, 95% CI={ci}")

# Visualization
# Pairwise scatter/histogram grid over the numeric columns; the title is
# lifted slightly (y=1.02) above the top of the figure.
numeric_frame = df[numeric_cols]
sns.pairplot(numeric_frame)
plt.suptitle("Feature Relationships", y=1.02)
plt.show()


Part 6 — Export Results (Appendix Submission)

In [None]:

# Export cleaned dataset
df.to_csv("cleaned_user_behavior_dataset.csv", index=False)
corr.to_csv("correlation_matrix.csv")

summary_path = "analysis_summary.txt"
with open(summary_path, "w") as f:
    f.write("=== AI-Assisted Analysis of User Behavior Dataset ===\n")
    f.write(f"Total records: {len(df)}\n\n")

    f.write("Key Correlations:\n")
    for pair in [('app_usage','battery_drain'),
                 ('screen_on_time','app_usage'),
                 ('age','app_usage')]:
        if all(col in numeric_cols for col in pair):
            f.write(f"{pair[0]} vs {pair[1]}: {corr.loc[pair[0],pair[1]]:.3f}\n")

    f.write("\nBootstrap 95% Confidence Intervals:\n")
    for pair in [('app_usage','battery_drain'),
                 ('screen_on_time','app_usage'),
                 ('age','app_usage')]:
        if all(col in numeric_cols for col in pair):
            mean_r, ci = bootstrap_corr(df, pair[0], pair[1])
            f.write(f"{pair[0]} vs {pair[1]}: mean r={mean_r:.3f}, 95% CI={ci}\n")

    f.write("\nAI Summary:\n")
    for line in ai_summary:
        f.write(f"- {line}\n")

    f.write("\nBias and Ethics:\nDataset anonymized; compliant with research ethics.\n")

print("Files saved:")
!ls -1 cleaned_user_behavior_dataset.csv correlation_matrix.csv analysis_summary.txt
