# Load and Prepare Data


In [None]:
import pandas as pd

# Read the survey CSV from its hard-coded path into the notebook-wide frame `df`.
df = pd.read_csv('/content/Students Social Media Addiction.csv')

# Confirmation banner followed by a five-row preview of the frame.
print("Dataset loaded successfully. Displaying the first 5 rows:", df.head(), sep="\n")

# Blank separator line, then the column summary (info() writes to stdout itself).
print()
print("DataFrame Info:")
df.info()

Dataset loaded successfully. Displaying the first 5 rows:
   Student_ID  Age  Gender Academic_Level     Country  Avg_Daily_Usage_Hours  \
0           1   19  Female  Undergraduate  Bangladesh                    5.2   
1           2   22    Male       Graduate       India                    2.1   
2           3   20  Female  Undergraduate         USA                    6.0   
3           4   18    Male    High School          UK                    3.0   
4           5   21    Male       Graduate      Canada                    4.5   

  Most_Used_Platform Affects_Academic_Performance  Sleep_Hours_Per_Night  \
0          Instagram                          Yes                    6.5   
1            Twitter                           No                    7.5   
2             TikTok                          Yes                    5.0   
3            YouTube                           No                    7.0   
4           Facebook                          Yes                    6.0   

   M

Based on the info() output, there are no missing values in the dataset.

---

The next step is to create the 'Country_Group' column, which labels each student's country as either 'Banned Countries' or 'Other Countries'.


In [None]:
# Countries that make up the 'Banned Countries' group; every other value of
# 'Country' is labelled 'Other Countries'.
banned_countries = ['UK', 'France', 'Saudi Arabia', 'China', 'Australia']

# Membership test on the whole column at once, then translate the resulting
# True/False mask into the two group labels.
df['Country_Group'] = (
    df['Country']
    .isin(banned_countries)
    .map({True: 'Banned Countries', False: 'Other Countries'})
)

# Preview the frame with the new column appended.
print("\nUpdated DataFrame with 'Country_Group' column (first 5 rows):")
print(df.head())


Updated DataFrame with 'Country_Group' column (first 5 rows):
   Student_ID  Age  Gender Academic_Level     Country  Avg_Daily_Usage_Hours  \
0           1   19  Female  Undergraduate  Bangladesh                    5.2   
1           2   22    Male       Graduate       India                    2.1   
2           3   20  Female  Undergraduate         USA                    6.0   
3           4   18    Male    High School          UK                    3.0   
4           5   21    Male       Graduate      Canada                    4.5   

  Most_Used_Platform Affects_Academic_Performance  Sleep_Hours_Per_Night  \
0          Instagram                          Yes                    6.5   
1            Twitter                           No                    7.5   
2             TikTok                          Yes                    5.0   
3            YouTube                           No                    7.0   
4           Facebook                          Yes                    6.0   


# Analyzing Gender Differences and Building the Model


In [None]:
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Gender analysis: compare each metric between Male and Female students with an
# independent-samples t-test, then fit a logistic regression that predicts
# gender from the same metrics.

# 1. Convert 'Affects_Academic_Performance' to numerical ('Yes' -> 1, 'No' -> 0).
#    Any value other than 'Yes'/'No' is mapped to NaN by Series.map.
df['Affects_Academic_Performance_Numerical'] = df['Affects_Academic_Performance'].map({'Yes': 1, 'No': 0})

# Metrics analysed below; the same list is reused as the model's feature set.
metrics = [
    'Avg_Daily_Usage_Hours',
    'Affects_Academic_Performance_Numerical',
    'Sleep_Hours_Per_Night',
    'Mental_Health_Score',
    'Conflicts_Over_Social_Media',
    'Addicted_Score'
]

# 2. Compare every metric between the two gender groups
print("\n--- Gender Differences Analysis ---")
for metric in metrics:
    print(f"\nAnalyzing: {metric}")

    # a. Mean over all students; reference point for the group means below
    overall_mean = df[metric].mean()
    print(f"  Overall Mean: {overall_mean:.2f}")

    # b. Values of this metric per gender (rows with any other 'Gender' value
    #    take part in the overall mean but not in the comparison)
    male_data = df.loc[df['Gender'] == 'Male', metric]
    female_data = df.loc[df['Gender'] == 'Female', metric]

    # c. Independent-samples t-test; NaN observations are omitted
    t_statistic, p_value = stats.ttest_ind(male_data, female_data, nan_policy='omit')
    print(f"  T-test: t-statistic = {t_statistic:.2f}, p-value = {p_value:.3f}")

    # d. Report significance at the 5% level (a NaN p-value falls in the "No" branch)
    if p_value < 0.05:
        print("  Result: Significant difference between Male and Female groups.")
    else:
        print("  Result: No significant difference between Male and Female groups.")

    # e. Group means, each compared against the overall mean
    male_mean = male_data.mean()
    female_mean = female_data.mean()
    print(f"  Male Mean: {male_mean:.2f}")
    print(f"  Female Mean: {female_mean:.2f}")

    for group_name, group_mean in (("Male", male_mean), ("Female", female_mean)):
        if group_mean > overall_mean:
            relation = "greater than"
        elif group_mean < overall_mean:
            relation = "less than"
        else:
            relation = "equal to"
        print(f"  {group_name} group average is {relation} the overall average for {metric}.")

# 3. Prepare data for classification model
# Reuse `metrics` so the feature set cannot drift away from the list analysed above.
X = df[metrics]

# Encode 'Gender' numerically; the class-to-code mapping is printed at the end
le = LabelEncoder()
y = le.fit_transform(df['Gender'])

# Hold out 20% of the rows for evaluation; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\n--- Classification Model for Gender Prediction ---")
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"Training set size: {len(X_train)} samples")
print(f"Test set size: {len(X_test)} samples")

# 4. Build and evaluate the classification model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Class-weighted averages. zero_division=0 scores a class that is never
# predicted as 0 without emitting UndefinedMetricWarning, matching the
# country-group and platform models later in this notebook.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nModel Performance:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

# Show which integer code LabelEncoder assigned to each gender label
print("Gender Label Encoding:")
for i, label in enumerate(le.classes_):
    print(f"  {label}: {i}")



--- Gender Differences Analysis ---

Analyzing: Avg_Daily_Usage_Hours
  Overall Mean: 4.92
  T-test: t-statistic = -1.96, p-value = 0.051
  Result: No significant difference between Male and Female groups.
  Male Mean: 4.83
  Female Mean: 5.01
  Male group average is less than the overall average for Avg_Daily_Usage_Hours.
  Female group average is greater than the overall average for Avg_Daily_Usage_Hours.

Analyzing: Affects_Academic_Performance_Numerical
  Overall Mean: 0.64
  T-test: t-statistic = -0.66, p-value = 0.512
  Result: No significant difference between Male and Female groups.
  Male Mean: 0.63
  Female Mean: 0.65
  Male group average is less than the overall average for Affects_Academic_Performance_Numerical.
  Female group average is greater than the overall average for Affects_Academic_Performance_Numerical.

Analyzing: Sleep_Hours_Per_Night
  Overall Mean: 6.87
  T-test: t-statistic = 1.25, p-value = 0.213
  Result: No significant difference between Male and Female g

# Analyzing Country Group Differences and Building the Model



In [None]:
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Country-group analysis: t-test every metric between 'Banned Countries' and
# 'Other Countries', then fit a logistic regression that predicts the group.

# Metrics analysed below; the same list doubles as the model's feature set.
metrics = [
    'Avg_Daily_Usage_Hours',
    'Affects_Academic_Performance_Numerical',
    'Sleep_Hours_Per_Night',
    'Mental_Health_Score',
    'Conflicts_Over_Social_Media',
    'Addicted_Score'
]

print("\n--- Country Group Differences Analysis ---")
for metric in metrics:
    print(f"\nAnalyzing: {metric}")

    # Mean over all students; reference point for the group means.
    overall_mean = df[metric].mean()
    print(f"  Overall Mean: {overall_mean:.2f}")

    # Values of this metric for each country group.
    banned_data = df.loc[df['Country_Group'] == 'Banned Countries', metric]
    other_data = df.loc[df['Country_Group'] == 'Other Countries', metric]

    # Independent-samples t-test; NaN observations are omitted.
    t_statistic, p_value = stats.ttest_ind(banned_data, other_data, nan_policy='omit')
    print(f"  T-test: t-statistic = {t_statistic:.2f}, p-value = {p_value:.3f}")

    # Significance at the 5% level (a NaN p-value yields "No significant").
    verdict = "Significant" if p_value < 0.05 else "No significant"
    print(f"  Result: {verdict} difference between Banned Countries and Other Countries groups.")

    # Group means first, then how each one sits relative to the overall mean.
    banned_mean = banned_data.mean()
    other_mean = other_data.mean()
    means_by_group = (("Banned Countries", banned_mean), ("Other Countries", other_mean))

    for group_name, group_mean in means_by_group:
        print(f"  {group_name} Mean: {group_mean:.2f}")

    for group_name, group_mean in means_by_group:
        if group_mean > overall_mean:
            relation = "greater than"
        elif group_mean < overall_mean:
            relation = "less than"
        else:
            relation = "equal to"
        print(f"  {group_name} group average is {relation} the overall average for {metric}.")

# Features are the analysed metrics; the target is the integer-encoded group.
X = df[metrics]
le_country = LabelEncoder()
y = le_country.fit_transform(df['Country_Group'])

# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(
    "\n--- Classification Model for Country Group Prediction ---",
    f"Features (X) shape: {X.shape}",
    f"Target (y) shape: {y.shape}",
    f"Training set size: {len(X_train)} samples",
    f"Test set size: {len(X_test)} samples",
    sep="\n",
)

# Fit on the training rows (fit() returns the estimator), predict the held-out rows.
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Class-weighted averages of the per-class scores.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nModel Performance:")
for score_name, score in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
):
    print(f"{score_name}: {score:.2f}")

# Integer code that LabelEncoder assigned to each group label.
print("Country Group Label Encoding:")
for i, label in enumerate(le_country.classes_):
    print(f"  {label}: {i}")


--- Country Group Differences Analysis ---

Analyzing: Avg_Daily_Usage_Hours
  Overall Mean: 4.92
  T-test: t-statistic = -1.54, p-value = 0.125
  Result: No significant difference between Banned Countries and Other Countries groups.
  Banned Countries Mean: 4.71
  Other Countries Mean: 4.94
  Banned Countries group average is less than the overall average for Avg_Daily_Usage_Hours.
  Other Countries group average is greater than the overall average for Avg_Daily_Usage_Hours.

Analyzing: Affects_Academic_Performance_Numerical
  Overall Mean: 0.64
  T-test: t-statistic = -4.48, p-value = 0.000
  Result: Significant difference between Banned Countries and Other Countries groups.
  Banned Countries Mean: 0.42
  Other Countries Mean: 0.67
  Banned Countries group average is less than the overall average for Affects_Academic_Performance_Numerical.
  Other Countries group average is greater than the overall average for Affects_Academic_Performance_Numerical.

Analyzing: Sleep_Hours_Per_Nigh

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



The previous step produced an `UndefinedMetricWarning` because the classification model never predicted one of the classes on the test set, which leaves the precision for that class undefined (0/0). To handle this and explicitly reflect this behavior in the metrics, I will set `zero_division=0` in the `precision_score`, `recall_score`, and `f1_score` calculations. This assigns a score of 0.0 to any class for which no predictions were made and suppresses the warning.



In [None]:
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Re-run of the country-group analysis; the weighted precision/recall/F1 calls
# now pass zero_division=0.

# Metrics analysed below; the same list doubles as the model's feature set.
metrics = [
    'Avg_Daily_Usage_Hours',
    'Affects_Academic_Performance_Numerical',
    'Sleep_Hours_Per_Night',
    'Mental_Health_Score',
    'Conflicts_Over_Social_Media',
    'Addicted_Score'
]

print("\n--- Country Group Differences Analysis ---")
for metric in metrics:
    print(f"\nAnalyzing: {metric}")

    # Overall mean of the metric across every student.
    overall_mean = df[metric].mean()
    print(f"  Overall Mean: {overall_mean:.2f}")

    # Split the metric by country group.
    in_banned = df['Country_Group'] == 'Banned Countries'
    in_other = df['Country_Group'] == 'Other Countries'
    banned_data = df.loc[in_banned, metric]
    other_data = df.loc[in_other, metric]

    # Independent-samples t-test, skipping NaN observations.
    t_statistic, p_value = stats.ttest_ind(banned_data, other_data, nan_policy='omit')
    print(f"  T-test: t-statistic = {t_statistic:.2f}, p-value = {p_value:.3f}")

    # 5% significance threshold (a NaN p-value reports "No significant").
    if p_value < 0.05:
        outcome = "Significant"
    else:
        outcome = "No significant"
    print(f"  Result: {outcome} difference between Banned Countries and Other Countries groups.")

    # Per-group means.
    banned_mean = banned_data.mean()
    other_mean = other_data.mean()
    print(f"  Banned Countries Mean: {banned_mean:.2f}")
    print(f"  Other Countries Mean: {other_mean:.2f}")

    # Position of each group mean relative to the overall mean.
    for group_name, group_mean in (("Banned Countries", banned_mean), ("Other Countries", other_mean)):
        relation = (
            "greater than" if group_mean > overall_mean
            else "less than" if group_mean < overall_mean
            else "equal to"
        )
        print(f"  {group_name} group average is {relation} the overall average for {metric}.")

# Feature matrix and integer-encoded target for the classifier.
X = df[metrics]
le_country = LabelEncoder()
y = le_country.fit_transform(df['Country_Group'])

# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\n--- Classification Model for Country Group Prediction ---")
for summary_line in (
    f"Features (X) shape: {X.shape}",
    f"Target (y) shape: {y.shape}",
    f"Training set size: {len(X_train)} samples",
    f"Test set size: {len(X_test)} samples",
):
    print(summary_line)

# Train the classifier and predict the held-out rows.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Class-weighted averages; zero_division=0 is forwarded to each of the three
# weighted scorers.
weighted_kwargs = {'average': 'weighted', 'zero_division': 0}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_kwargs)
recall = recall_score(y_test, y_pred, **weighted_kwargs)
f1 = f1_score(y_test, y_pred, **weighted_kwargs)

print(
    "\nModel Performance:",
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-Score: {f1:.2f}",
    sep="\n",
)

# Integer code that LabelEncoder assigned to each group label.
print("Country Group Label Encoding:")
for i, label in enumerate(le_country.classes_):
    print(f"  {label}: {i}")


--- Country Group Differences Analysis ---

Analyzing: Avg_Daily_Usage_Hours
  Overall Mean: 4.92
  T-test: t-statistic = -1.54, p-value = 0.125
  Result: No significant difference between Banned Countries and Other Countries groups.
  Banned Countries Mean: 4.71
  Other Countries Mean: 4.94
  Banned Countries group average is less than the overall average for Avg_Daily_Usage_Hours.
  Other Countries group average is greater than the overall average for Avg_Daily_Usage_Hours.

Analyzing: Affects_Academic_Performance_Numerical
  Overall Mean: 0.64
  T-test: t-statistic = -4.48, p-value = 0.000
  Result: Significant difference between Banned Countries and Other Countries groups.
  Banned Countries Mean: 0.42
  Other Countries Mean: 0.67
  Banned Countries group average is less than the overall average for Affects_Academic_Performance_Numerical.
  Other Countries group average is greater than the overall average for Affects_Academic_Performance_Numerical.

Analyzing: Sleep_Hours_Per_Nigh

# Analyze Social Media Groupings Differences and Build Model


In [None]:
# List the distinct values of 'Most_Used_Platform', in order of first appearance.
print("Unique social media platforms:", df['Most_Used_Platform'].unique(), sep="\n")

Unique social media platforms:
['Instagram' 'Twitter' 'TikTok' 'YouTube' 'Facebook' 'LinkedIn' 'Snapchat'
 'LINE' 'KakaoTalk' 'VKontakte' 'WhatsApp' 'WeChat']


Now that the unique social media platforms have been identified, the next step is to perform ANOVA tests for each of the six specified metrics across these platforms. If an ANOVA test shows a significant difference, a post-hoc Tukey's HSD test will be conducted to identify which specific platform groups differ. Finally, a classification model will be built to predict the social media grouping using these metrics, and its performance will be evaluated.



In [None]:
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Platform analysis: one-way ANOVA of every metric across 'Most_Used_Platform'
# (followed by Tukey's HSD when significant), then a logistic regression that
# predicts the platform from the same metrics.

# Metrics analysed below; the same list doubles as the model's feature set.
metrics = [
    'Avg_Daily_Usage_Hours',
    'Affects_Academic_Performance_Numerical',
    'Sleep_Hours_Per_Night',
    'Mental_Health_Score',
    'Conflicts_Over_Social_Media',
    'Addicted_Score'
]

print("\n--- Social Media Groupings Differences Analysis ---")
for metric in metrics:
    print(f"\nAnalyzing: {metric}")

    # Overall mean of the metric across every student.
    overall_mean = df[metric].mean()
    print(f"  Overall Mean: {overall_mean:.2f}")

    # One list of observations per platform.
    groups = df.groupby('Most_Used_Platform')[metric].apply(list)

    # ANOVA needs at least two groups; otherwise move on to the next metric.
    if len(groups) < 2:
        print(f"  Not enough groups to perform ANOVA for {metric}.")
        continue

    # One-way ANOVA across all platform groups.
    f_statistic, p_value = stats.f_oneway(*groups)
    print(f"  ANOVA: F-statistic = {f_statistic:.2f}, p-value = {p_value:.3f}")

    # Only a significant ANOVA triggers the pairwise post-hoc test
    # (a NaN p-value is treated as not significant).
    is_significant = p_value < 0.05
    if not is_significant:
        print("  Result: No significant difference between social media platform groups.")
    else:
        print("  Result: Significant difference between social media platform groups.")
        tukey_result = pairwise_tukeyhsd(endog=df[metric], groups=df['Most_Used_Platform'], alpha=0.05)
        print("\n  Tukey's HSD Post-Hoc Test:")
        print(tukey_result)

    # Per-platform means, each tagged with its position relative to the overall mean.
    group_means = df.groupby('Most_Used_Platform')[metric].mean()
    print("\n  Group Means compared to Overall Mean:")
    for platform, mean_val in group_means.items():
        if mean_val > overall_mean:
            note = "(Greater than overall average)"
        elif mean_val < overall_mean:
            note = "(Less than overall average)"
        else:
            note = "(Equal to overall average)"
        print(f"    {platform} Mean: {mean_val:.2f}", note)

# Feature matrix and integer-encoded platform target for the classifier.
X = df[metrics]
le_platform = LabelEncoder()
y = le_platform.fit_transform(df['Most_Used_Platform'])

# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(
    "\n--- Classification Model for Social Media Grouping Prediction ---",
    f"Features (X) shape: {X.shape}",
    f"Target (y) shape: {y.shape}",
    f"Training set size: {len(X_train)} samples",
    f"Test set size: {len(X_test)} samples",
    sep="\n",
)

# max_iter is raised to 2000 here (the earlier models use 1000) to help convergence.
model = LogisticRegression(max_iter=2000, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Class-weighted averages; zero_division=0 is forwarded to each of the three
# weighted scorers.
weighted_kwargs = {'average': 'weighted', 'zero_division': 0}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_kwargs)
recall = recall_score(y_test, y_pred, **weighted_kwargs)
f1 = f1_score(y_test, y_pred, **weighted_kwargs)

print("\nModel Performance:")
for score_name, score in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
):
    print(f"{score_name}: {score:.2f}")

# Integer code that LabelEncoder assigned to each platform label.
print("\nSocial Media Platform Label Encoding:")
for i, label in enumerate(le_platform.classes_):
    print(f"  {label}: {i}")



--- Social Media Groupings Differences Analysis ---

Analyzing: Avg_Daily_Usage_Hours
  Overall Mean: 4.92
  ANOVA: F-statistic = 28.67, p-value = 0.000
  Result: Significant difference between social media platform groups.

  Tukey's HSD Post-Hoc Test:
   Multiple Comparison of Means - Tukey HSD, FWER=0.05    
  group1    group2  meandiff p-adj   lower   upper  reject
----------------------------------------------------------
 Facebook Instagram    0.365 0.0728 -0.0147  0.7447  False
 Facebook KakaoTalk   0.2177 0.9999 -0.8243  1.2596  False
 Facebook      LINE  -1.2573 0.0047 -2.2993 -0.2154   True
 Facebook  LinkedIn  -1.9883    0.0 -2.8017 -1.1748   True
 Facebook  Snapchat    0.585 0.7533 -0.4198  1.5898  False
 Facebook    TikTok   0.8388    0.0  0.4222  1.2554   True
 Facebook   Twitter   0.3627 0.8702 -0.3389  1.0642  False
 Facebook VKontakte  -0.2573 0.9997 -1.2993  0.7846  False
 Facebook    WeChat   0.4527 0.9175 -0.4896  1.3949  False
 Facebook  WhatsApp   1.9686    0.0  