# Data Wrangling for DATA SCIENCE PROJECT

In [1]:
import pandas as pd


filename = "master-5 2.csv"

In [2]:
# Python list headers containing name of headers

headers = [
    "Age",
    "Gender",
    "Daily Social Media Usage(hours)",
    "Number of Social Media Platforms",
    "Frequency of Posts",
    "Frequency of Checking Notifications",
    "Self Reported Addiction Score",
    "Cyberbullying Experience",
    "Self Esteem Score",
    "Sleep Quality",
    "Anxiety Score",
    "Social Media Fatigue Score",
    "Mental Health Status"
]

In [3]:
# Pandas method read_csv() to load the data
# Set the parameter "names"

original = pd.read_csv(filename)
df = original.copy()

In [4]:
# To see what the data set looks like, we'll use the head() method.
df.head()

  # the first five rows of the dataframe.

Unnamed: 0,Age,Gender,Daily Social Media Usage(hours),Number of Social Media Platforms,Frequency of Posts,Frequency of Checking Notifications,Self Reported Addiction Score,Cyberbullying Experience,Self Esteem Score,Sleep Quality,Anxiety Score,Social Media Fatigue Score,Mental Health Status
0,35,Female,12.0,2,Often,Frequently,10.0,0,1.0,6.0,7.0,10,Poor
1,30,Female,12.0,2,Sometimes,Rarely,10.0,0,1.0,2.0,7.0,9,Poor
2,16,Female,12.0,2,Often,Occasionally,10.0,0,1.0,5.0,8.0,8,Poor
3,30,Female,12.0,2,Always,Occasionally,10.0,0,1.0,8.0,8.0,10,Poor
4,30,Male,12.0,2,Sometimes,Frequently,10.0,0,1.0,7.0,8.0,10,Poor


In [5]:
print(df.shape)
print(df.size)
print(df.dtypes)

num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

(10765, 13)
139945
Age                                      int64
Gender                                  object
Daily Social Media Usage(hours)        float64
Number of Social Media Platforms         int64
Frequency of Posts                      object
Frequency of Checking Notifications     object
Self Reported Addiction Score          float64
Cyberbullying Experience                 int64
Self Esteem Score                      float64
Sleep Quality                          float64
Anxiety Score                          float64
Social Media Fatigue Score               int64
Mental Health Status                    object
dtype: object


In [6]:
import sklearn
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder(categories=[['Never', 'Rarely', 'Sometimes', 'Often', 'Always']])
Category_encoded = ordinal_encoder.fit_transform(df[['Frequency of Posts']])
df['Frequency of Posts'] = Category_encoded
df.head()


Unnamed: 0,Age,Gender,Daily Social Media Usage(hours),Number of Social Media Platforms,Frequency of Posts,Frequency of Checking Notifications,Self Reported Addiction Score,Cyberbullying Experience,Self Esteem Score,Sleep Quality,Anxiety Score,Social Media Fatigue Score,Mental Health Status
0,35,Female,12.0,2,3.0,Frequently,10.0,0,1.0,6.0,7.0,10,Poor
1,30,Female,12.0,2,2.0,Rarely,10.0,0,1.0,2.0,7.0,9,Poor
2,16,Female,12.0,2,3.0,Occasionally,10.0,0,1.0,5.0,8.0,8,Poor
3,30,Female,12.0,2,4.0,Occasionally,10.0,0,1.0,8.0,8.0,10,Poor
4,30,Male,12.0,2,2.0,Frequently,10.0,0,1.0,7.0,8.0,10,Poor


In [7]:
import sklearn
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

# Define the desired categories
categories = [['Rarely', 'Occasionally', 'Frequently']]

# Create the OrdinalEncoder with handling for unknown values
ordinal_encoder1 = OrdinalEncoder(
    categories=categories,
    handle_unknown='use_encoded_value',
    unknown_value=np.nan  # Assign NaN to unknown values
)

# Fit and transform the 'Frequency of Checking Notifications' column
Category_encoded1 = ordinal_encoder1.fit_transform(df[['Frequency of Checking Notifications']])

# Assign the encoded values back to the DataFrame
df['Frequency of Checking Notifications'] = Category_encoded1

# Display the first few rows to check the changes
df.head()

Unnamed: 0,Age,Gender,Daily Social Media Usage(hours),Number of Social Media Platforms,Frequency of Posts,Frequency of Checking Notifications,Self Reported Addiction Score,Cyberbullying Experience,Self Esteem Score,Sleep Quality,Anxiety Score,Social Media Fatigue Score,Mental Health Status
0,35,Female,12.0,2,3.0,2.0,10.0,0,1.0,6.0,7.0,10,Poor
1,30,Female,12.0,2,2.0,0.0,10.0,0,1.0,2.0,7.0,9,Poor
2,16,Female,12.0,2,3.0,1.0,10.0,0,1.0,5.0,8.0,8,Poor
3,30,Female,12.0,2,4.0,1.0,10.0,0,1.0,8.0,8.0,10,Poor
4,30,Male,12.0,2,2.0,2.0,10.0,0,1.0,7.0,8.0,10,Poor


In [8]:
import sklearn
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder2 = OrdinalEncoder(categories=[['Poor', 'Fair', 'Good', 'Excellent']])
Category_encoded = ordinal_encoder2.fit_transform(df[['Mental Health Status']])
df['Mental Health Status'] = Category_encoded
df.head()

Unnamed: 0,Age,Gender,Daily Social Media Usage(hours),Number of Social Media Platforms,Frequency of Posts,Frequency of Checking Notifications,Self Reported Addiction Score,Cyberbullying Experience,Self Esteem Score,Sleep Quality,Anxiety Score,Social Media Fatigue Score,Mental Health Status
0,35,Female,12.0,2,3.0,2.0,10.0,0,1.0,6.0,7.0,10,0.0
1,30,Female,12.0,2,2.0,0.0,10.0,0,1.0,2.0,7.0,9,0.0
2,16,Female,12.0,2,3.0,1.0,10.0,0,1.0,5.0,8.0,8,0.0
3,30,Female,12.0,2,4.0,1.0,10.0,0,1.0,8.0,8.0,10,0.0
4,30,Male,12.0,2,2.0,2.0,10.0,0,1.0,7.0,8.0,10,0.0


In [9]:
df.head()

Unnamed: 0,Age,Gender,Daily Social Media Usage(hours),Number of Social Media Platforms,Frequency of Posts,Frequency of Checking Notifications,Self Reported Addiction Score,Cyberbullying Experience,Self Esteem Score,Sleep Quality,Anxiety Score,Social Media Fatigue Score,Mental Health Status
0,35,Female,12.0,2,3.0,2.0,10.0,0,1.0,6.0,7.0,10,0.0
1,30,Female,12.0,2,2.0,0.0,10.0,0,1.0,2.0,7.0,9,0.0
2,16,Female,12.0,2,3.0,1.0,10.0,0,1.0,5.0,8.0,8,0.0
3,30,Female,12.0,2,4.0,1.0,10.0,0,1.0,8.0,8.0,10,0.0
4,30,Male,12.0,2,2.0,2.0,10.0,0,1.0,7.0,8.0,10,0.0


In [10]:
df.iloc[7100:7119]

Unnamed: 0,Age,Gender,Daily Social Media Usage(hours),Number of Social Media Platforms,Frequency of Posts,Frequency of Checking Notifications,Self Reported Addiction Score,Cyberbullying Experience,Self Esteem Score,Sleep Quality,Anxiety Score,Social Media Fatigue Score,Mental Health Status
7100,25,Male,3.0,3,1.0,2.0,7.0,1,7.0,9.0,3.0,7,2.0
7101,40,Female,3.0,3,1.0,2.0,7.0,1,8.0,4.0,3.0,5,2.0
7102,35,Male,3.0,3,0.0,2.0,7.0,0,8.0,6.0,3.0,5,1.0
7103,45,Other,3.0,2,1.0,1.0,7.0,0,7.0,6.0,3.0,5,2.0
7104,50,Male,3.0,3,3.0,1.0,7.0,0,9.0,6.0,3.0,4,2.0
7105,45,Male,3.0,3,1.0,1.0,7.0,0,6.0,6.0,3.0,3,2.0
7106,40,Male,3.0,1,2.0,0.0,7.0,1,6.0,6.0,3.0,4,2.0
7107,25,Male,3.0,2,0.0,0.0,7.0,0,7.0,6.0,3.0,5,2.0
7108,18,Female,3.0,3,2.0,1.0,7.0,0,8.0,9.0,3.0,2,2.0
7109,16,Male,3.0,1,1.0,0.0,7.0,0,8.0,3.0,3.0,6,2.0


## **Preprocess Data**

Data is already mostly preprocessed. Just Preform some imputation and Standardization (Not Normalization)
Notice that the transformed data is actually stored in variables other than data (data_num_scaled, data_cat_scaled). This is because we will change the original data in the pipelining portion

In [11]:
from sklearn.impute import SimpleImputer


data_num = df[num_cols]
data_cat = df[cat_cols]

imputer_num = SimpleImputer(strategy='median')
data_num_imputed = imputer_num.fit_transform(data_num)

imputer_cat = SimpleImputer(strategy='most_frequent')
data_cat_imputed = imputer_cat.fit_transform(data_cat)

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data_num_scaled = scaler.fit_transform(data_num_imputed)


In [13]:
df_num_scaled = pd.DataFrame(data_num_scaled, columns=num_cols)
df_cat_imputed = pd.DataFrame(data_cat_imputed, columns=cat_cols)

# Combine numeric and categorical data
df_final = pd.concat([df_num_scaled, df_cat_imputed], axis=1)

# Optional: Restore original column order (if necessary)
df_final = df_final[df.columns]

In [14]:

df_final.describe()

Unnamed: 0,Age,Daily Social Media Usage(hours),Number of Social Media Platforms,Self Reported Addiction Score,Cyberbullying Experience,Self Esteem Score,Sleep Quality,Anxiety Score,Social Media Fatigue Score
count,10765.0,10765.0,10765.0,10765.0,10765.0,10765.0,10765.0,10765.0,10765.0
mean,-9.83473e-17,8.448627000000001e-17,-1.108882e-16,3.379451e-16,-8.514632000000001e-17,-8.448627000000001e-17,1.866289e-16,0.0,-1.267294e-16
std,1.000046,1.000046,1.000046,1.000046,1.000046,1.000046,1.000046,1.000046,1.000046
min,-1.464559,-1.705424,-1.379016,-4.284401,-0.499129,-1.533301,-2.001234,-2.003861,-2.509077
25%,-0.8362073,-0.6827967,-0.6500639,-0.7905186,-0.499129,-0.7960482,-0.6548348,-0.663513,-0.8438015
50%,0.001595444,-0.3419208,0.07888796,0.3741089,-0.499129,-0.05879532,-0.2060351,-0.21673,-0.2887096
75%,0.5252222,0.6807068,0.8078398,0.9564226,-0.499129,0.6784575,0.6915643,0.676835,0.8214741
max,2.096102,2.385086,2.265744,0.9564226,2.00349,1.784337,2.037963,2.017184,2.48675


In [15]:
print("Missing values per column:\n", df_final.isnull().sum())
df_final.to_csv("master-6.csv", index=False)

Missing values per column:
 Age                                    0
Gender                                 0
Daily Social Media Usage(hours)        0
Number of Social Media Platforms       0
Frequency of Posts                     0
Frequency of Checking Notifications    0
Self Reported Addiction Score          0
Cyberbullying Experience               0
Self Esteem Score                      0
Sleep Quality                          0
Anxiety Score                          0
Social Media Fatigue Score             0
Mental Health Status                   0
dtype: int64


<hr>
<p>Copyright &copy; 2018 IBM Developer Skills Network. This notebook and its source code are released under the terms of the <a href="https://cognitiveclass.ai/mit-license/">MIT License</a>.</p>