## Preprocessing Steps

### Analysing data

In [28]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# The dataset path is relative to the notebook's working directory.
DATA_PATH = 'Central_Asia_Social_Media_MentalHealth.csv'

df = pd.read_csv(DATA_PATH)
# Preview the first five rows.
df.head()

Unnamed: 0,Country,Age Group,Gender,Urban/Rural,Daily SM Usage (hrs),Most Used SM Platform,Frequency of SM Use,Likes Received (per post),Comments Received (per post),Shares Received (per post),Peer Comparison Frequency (1-10),Social Anxiety Level (1-10),Socioeconomic Status,Education Level,State,Body Image Impact (1-10),Sleep Quality Impact (1-10),Self Confidence Impact (1-10),Cyberbullying Experience (1-10),Anxiety Levels (1-10)
0,Uzbekistan,18-25,Female,Rural,6.3,Instagram,Weekly,302,56,32,1,6,Middle,Master's,Bukhara,8,9,5,2,7
1,Uzbekistan,18-25,Male,Rural,1.2,Facebook,Weekly,294,80,25,1,6,High,Bachelor's,Namangan,5,1,9,5,3
2,Uzbekistan,36-45,Male,Rural,4.8,Instagram,Monthly,288,20,8,6,3,Low,High School,Samarkand,9,9,4,5,8
3,Turkmenistan,36-45,Female,Rural,1.4,Twitter,Daily,169,1,86,2,7,High,Bachelor's,Dashoguz,5,5,6,7,3
4,Kazakhstan,56-65,Female,Rural,0.8,Twitter,Weekly,371,31,6,5,3,High,Master's,Karaganda,10,7,6,1,10


In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['Country', 'Age Group', 'Gender', 'Urban/Rural', 'Daily SM Usage (hrs)',
       'Most Used SM Platform', 'Frequency of SM Use',
       'Likes Received (per post)', 'Comments Received (per post)',
       'Shares Received (per post)', 'Peer Comparison Frequency (1-10)',
       'Social Anxiety Level (1-10)', 'Socioeconomic Status',
       'Education Level', 'State', 'Body Image Impact (1-10)',
       'Sleep Quality Impact (1-10)', 'Self Confidence Impact (1-10)',
       'Cyberbullying Experience (1-10)', 'Anxiety Levels (1-10)'],
      dtype='object')

In [4]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

Country                             0
Age Group                           0
Gender                              0
Urban/Rural                         0
Daily SM Usage (hrs)                0
Most Used SM Platform               0
Frequency of SM Use                 0
Likes Received (per post)           0
Comments Received (per post)        0
Shares Received (per post)          0
Peer Comparison Frequency (1-10)    0
Social Anxiety Level (1-10)         0
Socioeconomic Status                0
Education Level                     0
State                               0
Body Image Impact (1-10)            0
Sleep Quality Impact (1-10)         0
Self Confidence Impact (1-10)       0
Cyberbullying Experience (1-10)     0
Anxiety Levels (1-10)               0
dtype: int64

In [5]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [6]:
# Print the per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 20 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   Country                           200000 non-null  object 
 1   Age Group                         200000 non-null  object 
 2   Gender                            200000 non-null  object 
 3   Urban/Rural                       200000 non-null  object 
 4   Daily SM Usage (hrs)              200000 non-null  float64
 5   Most Used SM Platform             200000 non-null  object 
 6   Frequency of SM Use               200000 non-null  object 
 7   Likes Received (per post)         200000 non-null  int64  
 8   Comments Received (per post)      200000 non-null  int64  
 9   Shares Received (per post)        200000 non-null  int64  
 10  Peer Comparison Frequency (1-10)  200000 non-null  int64  
 11  Social Anxiety Level (1-10)       200000 non-null  i

### Data Cleaning

In [7]:
# Numeric columns kept for modelling. The last entry is the raw anxiety
# score, from which the class label is derived in a later cell.
key_features = [
    'Daily SM Usage (hrs)',
    'Likes Received (per post)',
    'Comments Received (per post)',
    'Shares Received (per post)',
    'Peer Comparison Frequency (1-10)',
    'Cyberbullying Experience (1-10)',
    'Body Image Impact (1-10)',
    'Self Confidence Impact (1-10)',
    'Sleep Quality Impact (1-10)',
    'Anxiety Levels (1-10)'
]

# Take an explicit copy: a bare column selection is flagged by pandas as a
# possible copy of the original frame, so assigning a new column to it later
# raises SettingWithCopyWarning.
df = df[key_features].copy()

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [9]:
def create_anxiety_class(x):
    """Bucket a numeric anxiety score into an ordinal class.

    Returns 0 when ``x <= 3``, 1 when ``3 < x <= 6``, and 2 otherwise.
    """
    if x <= 3:
        return 0
    # Everything above 3 is either the middle band or the top band.
    return 1 if x <= 6 else 2

In [10]:
# Derive the class label from the raw score, one value at a time.
anxiety_scores = df['Anxiety Levels (1-10)']
df['Anxiety_Class'] = anxiety_scores.map(create_anxiety_class)

In [11]:
# Features are every column except the raw score and the class derived from
# it; keeping either one would leak the target into the inputs.
target_columns = ['Anxiety Levels (1-10)', 'Anxiety_Class']
X = df.drop(columns=target_columns)
y = df['Anxiety_Class']

In [14]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [15]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Classification Task: Determining if a person is at risk of experiencing high anxiety due to social media use

### DECISION TREE

In [17]:
# Depth-limited tree with balanced class weights, trained on the unscaled
# features.
decision_tree_model = DecisionTreeClassifier(
    random_state=42,
    class_weight='balanced',
    max_depth=4,
).fit(X_train, y_train)
y_pred_dt = decision_tree_model.predict(X_test)

# Score the held-out split.
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_report = classification_report(y_test, y_pred_dt)
print("Accuracy:", dt_accuracy)
print("\nClassification Report:")
print(dt_report)


Accuracy: 0.310125

Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.48      0.37     11982
           1       0.30      0.42      0.35     11996
           2       0.40      0.10      0.16     16022

    accuracy                           0.31     40000
   macro avg       0.33      0.33      0.29     40000
weighted avg       0.34      0.31      0.28     40000



### RANDOM FOREST

In [18]:
# Forest of 150 depth-limited trees with balanced class weights; the seed
# makes the fitted model repeatable.
random_forest_model = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    max_depth=8,
    n_estimators=150,
)

In [19]:
# Fit on the unscaled training split and predict the held-out split
# (fit returns the estimator itself, so the calls can be chained).
y_pred_rf = random_forest_model.fit(X_train, y_train).predict(X_test)

In [20]:
# Score the random forest on the held-out split.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print("Accuracy:", rf_accuracy)
print("\nClassification Report:")
print(rf_report)

Accuracy: 0.334825

Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.33      0.31     11982
           1       0.30      0.33      0.31     11996
           2       0.41      0.34      0.37     16022

    accuracy                           0.33     40000
   macro avg       0.33      0.33      0.33     40000
weighted avg       0.34      0.33      0.34     40000



### KNN

In [21]:
# KNN is distance-based, so it is trained and evaluated on the standardized
# features rather than the raw ones.
knn_model = KNeighborsClassifier(n_neighbors=7).fit(X_train_scaled, y_train)
y_pred_knn = knn_model.predict(X_test_scaled)

In [22]:
# Score the KNN model on the held-out split.
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn)
print("Accuracy:", knn_accuracy)
print("\nClassification Report:")
print(knn_report)

Accuracy: 0.33655

Classification Report:
              precision    recall  f1-score   support

           0       0.29      0.31      0.30     11982
           1       0.30      0.28      0.29     11996
           2       0.40      0.40      0.40     16022

    accuracy                           0.34     40000
   macro avg       0.33      0.33      0.33     40000
weighted avg       0.34      0.34      0.34     40000



In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [27]:
# NOTE(review): y_train / y_test hold the 3-level Anxiety_Class labels, so
# this regresses on class codes rather than the 1-10 score. Confirm that is
# the intended target.
linear_model = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lin = linear_model.predict(X_test_scaled)

# Score the continuous predictions against the class codes.
lin_mse = mean_squared_error(y_test, y_pred_lin)
lin_r2 = r2_score(y_test, y_pred_lin)
print("MSE:", lin_mse)
print("R2 Score:", lin_r2)

MSE: 0.6899016331602081
R2 Score: -3.816732895733921e-06
