# Pre Work Installations for Kaggle API
### API Key Creation is needed. (https://www.kaggle.com/settings)
### The commands below install kaggle, move the API key into the required directory, download the needed dataset, and unzip it into the data folder.
### *Run only Once

In [None]:
# Install the kaggle package, which provides the `kaggle` command used to download the dataset.
!pip install kaggle

In [None]:
#Create API Key here
#https://www.kaggle.com/settings
#Scroll down to API and create new key, should download a json file in downloads folder.

# Copy the downloaded key into <home>/.kaggle with the standard library so the
# same cell works on Windows, Linux and macOS without relying on a `cp` command.
# The target folder is created first because it does not exist on a fresh machine.
import shutil
from pathlib import Path

kaggle_dir = Path.home() / ".kaggle"
kaggle_dir.mkdir(parents=True, exist_ok=True)
shutil.copy(Path.home() / "Downloads" / "kaggle.json", kaggle_dir / "kaggle.json")

In [None]:
#https://www.kaggle.com/datasets/souvikahmed071/social-media-and-mental-health
# Download the dataset archive with the Kaggle CLI.
# NOTE(review): presumably the zip lands in the current working directory - confirm.
!kaggle datasets download -d "souvikahmed071/social-media-and-mental-health"

In [None]:
# Create the .kaggle folder inside the user's home directory.
# NOTE(review): an earlier cell copies kaggle.json into this folder, so this
# presumably has to run before that copy - confirm the intended cell order.
#Windows
!mkdir "%USERPROFILE%/.kaggle"

#Linux/Mac
# !mkdir ~/.kaggle

In [None]:
#Install Unzip command
# NOTE(review): pip installs Python packages; confirm this actually provides an
# `unzip` shell command on the target machine.
!pip install unzip

In [None]:
#Unzip downloaded datasheet into newly created data folder
# zipfile ships with Python, so extraction does not depend on an `unzip`
# executable being installed; extractall() creates data/ when it is missing.
import zipfile

with zipfile.ZipFile("social-media-and-mental-health.zip") as archive:
    archive.extractall("data")

In [None]:
#Do not Need
# Delete the extra files from the data folder. pathlib is used instead of `rm`
# so the cell also works in a plain Windows shell, and the existence check
# keeps a second run from failing.
from pathlib import Path

for unneeded_name in ("Correlation_between_Social_Media_use_and_Mental_Health.ipynb", "README.md"):
    unneeded_path = Path("data") / unneeded_name
    if unneeded_path.exists():
        unneeded_path.unlink()

# Begin Here

In [None]:
#importing dependencies 
import os

import hvplot.pandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import linregress

In [None]:
# Load data/smmh.csv into main_df and display it.
main_df = pd.read_csv("data/smmh.csv")
main_df

In [None]:
def relabel_averageTime(row):
    """Collapse a daily social-media time answer into a coarser bucket.

    row is anything indexable by the survey question text (for example a
    DataFrame row). Returns '0-3 hours', '3-5 hours' or '5+ hours'; an answer
    outside the listed options yields None.
    """
    answer = row['8. What is the average time you spend on social media every day?']
    buckets = (
        ('0-3 hours', ['Less than an Hour', 'Between 1 and 2 hours', 'Between 2 and 3 hours']),
        ('3-5 hours', ['Between 3 and 4 hours', 'Between 4 and 5 hours']),
        ('5+ hours', ['More than 5 hours']),
    )
    # Return the label of the first bucket that lists this answer
    for bucket_label, bucket_answers in buckets:
        if answer in bucket_answers:
            return bucket_label
    return None

#Bucket every respondent's answer by passing each row to the relabel function
main_df['Average Time on Social Media'] = main_df.apply(relabel_averageTime, axis=1)

#The bucketed column replaces the original question column
main_df.drop(columns=['8. What is the average time you spend on social media every day?'], inplace=True)



In [None]:
#Restrict the data to respondents who answered "Yes" to using social media
uses_social_media = main_df["6. Do you use social media?"] == "Yes"
main_df = main_df[uses_social_media].copy()


In [None]:
# Print every column name currently in main_df
column_list = list(main_df.columns)
print(column_list)

In [None]:
#main_df

Age Groups Surveyed

In [None]:
# Count how many respondents reported each age. The column is selected by name
# (the same name the age-group cell uses) rather than by position, so the count
# stays correct if the column order ever changes.
ages_surveyed = main_df['1. What is your age?'].value_counts()
#print(ages_surveyed.head(10))
# value_counts sorts by frequency, so the tail holds the ten least common ages
print(ages_surveyed.tail(10))

In [None]:
# First look at the age distribution: one bar per reported age
age_values = ages_surveyed.index.values
age_totals = ages_surveyed.values
plt.bar(age_values, age_totals)

# Axis titles
plt.ylabel("Total per Age")
plt.xlabel("Ages of Those Surveyed")

# Keep the x tick labels horizontal
plt.xticks(rotation=0)
plt.show()


In [None]:
# Age-group edges and their labels; the last group has no upper limit
bins = [0, 9, 19, 29, 39, 49, 59, float('inf')]
labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-95']

# Assign every respondent to an age group (include_lowest puts age 0 in the first group)
respondent_ages = main_df['1. What is your age?']
main_df['Age Groups'] = pd.cut(respondent_ages, bins=bins, labels=labels, include_lowest=True)

# Respondents per age group, ordered from youngest group to oldest
age_group_counts = main_df['Age Groups'].value_counts().sort_index()

# Bar chart of the grouped distribution
age_group_counts.plot.bar()
plt.title('Age Group Distribution')
plt.xlabel('Age Groups')
plt.ylabel('Count')
plt.xticks(rotation=0)

# Render the chart
plt.show()

Genders Surveyed

In [None]:
# Collect the distinct gender answers present in the survey
gender_answers = main_df['2. Gender']
genders_surveyed = set(gender_answers)
print(genders_surveyed)

In [None]:
# Fold every free-text gender answer into one "Others" group so results fall
# under "Male", "Female", or "Others". The replacement is limited to the gender
# column: calling replace on the whole DataFrame would also rewrite any other
# column that happened to contain one of these strings.
other_gender_answers = [
    'unsure ',
    'There are others???',
    'NB',
    'Trans',
    'Non binary ',
    'Nonbinary ',
    'Non-binary',
]
main_df['2. Gender'] = main_df['2. Gender'].replace(other_gender_answers, 'Others')

# Show the categories that remain after grouping
genders_surveyed = set(main_df['2. Gender'])
print(genders_surveyed)

In [None]:
# Counts for each gender category (value_counts orders them from most to least frequent)
gender_counts = main_df['2. Gender'].value_counts()
gender_counts

In [None]:
# Recompute the per-gender tally under the genders_surveyed name
genders_surveyed = main_df['2. Gender'].value_counts()

# Bar chart of respondents per gender category
plt.bar(gender_counts.index, gender_counts)
plt.title('Survey Gender Distribution')
plt.xlabel('Gender')
plt.ylabel('Count')

# Tilt the x-axis labels
plt.xticks(rotation=45)

# Write each bar's share of all respondents just above the bar
total = gender_counts.sum()
for i, count in enumerate(gender_counts):
    plt.text(i, count, f'{count / total * 100:.1f}%', ha='center', va='bottom')

plt.show()

In [None]:
# Display main_df as it stands after the age-group and gender steps above.
main_df

# Collection of App Usage vs Mental Health

## Does a specific SM platform or collection of platforms lead to more issues than others? 

- The most common social media platforms in the surveyed data were Facebook, Instagram, and YouTube.
- Facebook and Instagram can both be associated with social comparison, which may lead to negative emotions.
- We do not have specific data on how a distinct platform makes the recipient feel. Reducing the data to recipients who recorded only one of the top 3 platforms did not leave enough information for a conclusion.
- Users in this sample tend to use more than one platform along with the top three most popular recorded.

## Does using a collection of platforms lead to more of an impact on mental health?
- The distribution among the dataset had on average 3-5 social media platforms. More specifically on average 4 platforms.
- The Average Total Frequency Score increased by 12% from the 1-2 social media platform group to the 3-5 group, which is a minor increase. This may suggest that engaging with multiple platforms has more of an impact on attention throughout the day.
- The 1-2 Social Media Platform group also had on average an hour less spent on social media than the 3-5 and 6+ platform groups.

In [None]:
#Remove Timestamp; do not really need
# .copy() makes socialApps_df an independent DataFrame. Without it the slice can
# share data with main_df, and the columns added to it in later cells can
# trigger pandas' SettingWithCopyWarning.
socialApps_df = main_df.iloc[:, 1:].copy()
socialApps_df.head()

In [None]:
#Platform answers for every respondent (column 6 of socialApps_df)
appsList = socialApps_df.iloc[:, 6]

#Placeholder list for the number of apps
numberOfApps = []

#Split each ";"-separated answer into its own list of platform names
listOfApps = [answer.split(";") for answer in appsList]

In [None]:
#Count how many platforms each respondent listed
listOfNumberApps = list(map(len, listOfApps))

#Store the counts as a new column on the social apps df
socialApps_df['Number of Apps'] = listOfNumberApps

In [None]:
#Short label for each survey question, named after the type of question it is.
questionLabels = {
    '9. How often do you find yourself using Social media without a specific purpose?': 'ADHD Q1',
    '10. How often do you get distracted by Social media when you are busy doing something?': 'ADHD Q2',
    "11. Do you feel restless if you haven't used Social media in a while?": 'Anxiety Q1',
    '12. On a scale of 1 to 5, how easily distracted are you?': 'ADHD Q3',
    '13. On a scale of 1 to 5, how much are you bothered by worries?': 'Anxiety Q2',
    '14. Do you find it difficult to concentrate on things?': 'ADHD Q4',
    '15. On a scale of 1-5, how often do you compare yourself to other successful people through the use of social media?': 'Self Esteem Q1',
    '17. How often do you look to seek validation from features of social media?': 'Self Esteem Q2',
    '18. How often do you feel depressed or down?': 'Depression Q1',
    '19. On a scale of 1 to 5, how frequently does your interest in daily activities fluctuate?': 'Depression Q2',
    '20. On a scale of 1 to 5, how often do you face issues regarding sleep?': 'Depression Q3',
}

#Apply the short labels to the DataFrame in place
socialApps_df.rename(columns=questionLabels, inplace=True)


In [None]:
# Custom app ranges. With include_lowest=True the groups are [1, 2], (2, 5] and (5, inf).
# The last edge is open-ended (like the age bins) so a respondent who lists more
# than 20 platforms still lands in '6+' instead of becoming NaN.
appBins = [1, 2, 5, float('inf')]

# Labels for the app groups
appLabels = ['1-2', '3-5', '6+']

#Bin the new groups
socialApps_df['App Groups'] = pd.cut(socialApps_df['Number of Apps'], bins=appBins, labels=appLabels, include_lowest=True)

In [None]:
# Preview the first rows of socialApps_df.
socialApps_df.head()

In [None]:
#Average number of platforms for each daily-time bucket
statSummary = socialApps_df.groupby('Average Time on Social Media')
averageNumberOfApps = statSummary['Number of Apps'].mean()

#Bar chart of time spent on social media vs amount of platforms
plt.bar(averageNumberOfApps.index, averageNumberOfApps, edgecolor='black')
plt.title('Average Time Spent vs Average Amount of Platforms')
plt.xlabel("Average Time Spent on Social Media")
plt.ylabel("Average Amount of Apps")
plt.yticks(np.arange(0, 5, step=0.5))
plt.show()

#Show the averages behind the chart
averageNumberOfApps

In [None]:
from collections import Counter

#Merge the per-respondent lists into one flat list ex: ([["A"], ["B"], ["C"]] = ["A", "B", "C"])
appsTotalList = []
for platforms in listOfApps:
    appsTotalList.extend(platforms)

#Tally how many times each platform was recorded
recordedAppsTotal = Counter(appsTotalList)

#Turn the tally into a DF with one row per platform
recordedAppsTotal_df = pd.DataFrame.from_dict(recordedAppsTotal, orient='index', columns=["Total"])

In [None]:
#Bar chart of how often each platform was recorded
platformNames = recordedAppsTotal_df.index
platformTotals = recordedAppsTotal_df['Total']
plt.bar(platformNames, platformTotals, width=0.6, align='center', color='blue', edgecolor='black')
plt.title('Total Amount for Platforms Used')
plt.xlabel('Social Media Platforms')
plt.ylabel('Total Amount Reported')
plt.xticks(rotation=45)
plt.show()

#Show the totals behind the chart
recordedAppsTotal_df

In [None]:
#Questions that feed each category total:
#ADHD 4 questions (20 points Max), Self Esteem 2 (10 Max), Anxiety 2 (10 Max), Depression 3 (15 Max)
scoreGroups = {
    'ADHD Total Score': ['ADHD Q1', 'ADHD Q2', 'ADHD Q3', 'ADHD Q4'],
    'Self Esteem Total Score': ['Self Esteem Q1', 'Self Esteem Q2'],
    'Anxiety Total Score': ['Anxiety Q1', 'Anxiety Q2'],
    'Depression Total Score': ['Depression Q1', 'Depression Q2', 'Depression Q3'],
}

#Add the question columns of each category together, one category at a time
for totalColumn, questionColumns in scoreGroups.items():
    runningTotal = socialApps_df[questionColumns[0]]
    for questionColumn in questionColumns[1:]:
        runningTotal = runningTotal + socialApps_df[questionColumn]
    socialApps_df[totalColumn] = runningTotal

#Overall total across all 11 questions (55 points Max)
overallTotal = socialApps_df['ADHD Total Score']
for categoryColumn in ['Self Esteem Total Score', 'Anxiety Total Score', 'Depression Total Score']:
    overallTotal = overallTotal + socialApps_df[categoryColumn]
socialApps_df['Total Score'] = overallTotal


In [None]:
#Filter for specific platform of the top 3, Facebook
# Keep respondents whose answer is exactly "Facebook" (ignoring surrounding
# whitespace). A boolean mask replaces the iterrows() loop: it keeps the column
# dtypes intact and still returns a frame with every column when nobody matches.
platformAnswers = socialApps_df['7. What social media platforms do you commonly use?']
filtered_df = socialApps_df[platformAnswers.str.strip() == 'Facebook']
filtered_df.head()

In [None]:
#Filter for specific platform of the top 3, Instagram
# Keep respondents whose answer is exactly "Instagram" (ignoring surrounding
# whitespace). A boolean mask replaces the iterrows() loop: it keeps the column
# dtypes intact and still returns a frame with every column when nobody matches.
platformAnswers = socialApps_df['7. What social media platforms do you commonly use?']
filteredIn_df = socialApps_df[platformAnswers.str.strip() == 'Instagram']
filteredIn_df.head()

In [None]:
#Filter for specific platform of the top 3, YouTube
# Keep respondents whose answer is exactly "YouTube" (ignoring surrounding
# whitespace). A boolean mask replaces the iterrows() loop: it keeps the column
# dtypes intact and still returns a frame with every column when nobody matches.
platformAnswers = socialApps_df['7. What social media platforms do you commonly use?']
filteredYT_df = socialApps_df[platformAnswers.str.strip() == 'YouTube']
filteredYT_df.head()

In [None]:
#Plot the individual platform average scores.
# sharey=True puts the three panels on one y scale so the platforms can be
# compared directly. plt.yticks() would only reach the last axes created, so the
# ticks are set once on the shared axis instead.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)

# One box per platform, built from the single-platform respondents
ax1.boxplot(filtered_df['Total Score'])
ax2.boxplot(filteredIn_df['Total Score'])
ax3.boxplot(filteredYT_df['Total Score'])

# The y label sits on the left panel; the x label and title sit on the middle one
ax1.set_ylabel('Average Frequency Score from Recipients')
ax2.set_xlabel('Platform')
ax2.set_title('Platform vs Average Total Score')
ax1.set_xticklabels(['Facebook'])
ax2.set_xticklabels(['Instagram'])
ax3.set_xticklabels(['YouTube'])
ax1.set_yticks(np.arange(5, 50, step=5))
plt.show()

In [None]:
#Show the column names of socialApps_df
socialApps_df.columns

In [None]:
#Aggregate Averages for Tendency by Category and Total
# observed=False is spelled out so every App Group stays in the result; newer
# pandas versions warn when it is left implicit for a categorical key.
totalScoreNumberOfApps = socialApps_df.groupby('App Groups', observed=False).agg({'ADHD Total Score': 'mean', 'Self Esteem Total Score': 'mean',
       'Anxiety Total Score': 'mean', 'Depression Total Score': 'mean', 'Total Score': 'mean'})

# One x position per app group; the four bars of a group are offset around it
X_axis = np.arange(len(totalScoreNumberOfApps.index))

#Plot the average question type scores amongst the groups
plt.bar(X_axis - 0.34, totalScoreNumberOfApps['Self Esteem Total Score'], width=0.2, edgecolor='black', zorder=3)
plt.bar(X_axis - 0.11, totalScoreNumberOfApps['Anxiety Total Score'], width=0.2, edgecolor='black', zorder=3)
plt.bar(X_axis + 0.11, totalScoreNumberOfApps['Depression Total Score'], width=0.2, edgecolor='black', zorder=3)
plt.bar(X_axis + 0.34, totalScoreNumberOfApps['ADHD Total Score'], width=0.2, edgecolor='black', zorder=3)
plt.xticks(X_axis, totalScoreNumberOfApps.index)
plt.yticks(np.arange(0, 20 , step=2))
plt.grid(axis='y', color='gray', linewidth=0.4, zorder=0)
# Legend entries follow the order in which the bars were drawn
plt.legend(['Avg Self Esteem Total Score (2 Q)', 'Avg Anxiety Total Score (2 Q)', 'Avg Depression Total Score (3 Q)','Avg ADHD Total Score (4 Q)'], bbox_to_anchor=(1, 1))
plt.ylabel('Average Scores *Higher being worse')
plt.xlabel('Collection of Platform Groups')
# Label each group with its average Total Score, read from the data instead of
# typed in by hand so the labels cannot drift from the chart.
for position, averageTotal in enumerate(totalScoreNumberOfApps['Total Score']):
    if pd.notna(averageTotal):
        plt.annotate(f"{averageTotal:.0f}", (position, 14))
plt.title('Average Tendency Score vs Number of Platforms')
# savefig does not create missing folders
os.makedirs('output_data', exist_ok=True)
plt.savefig('output_data/averageTendencyScorePlatforms')
plt.show()
totalScoreNumberOfApps

In [None]:
#Plot the distribution of Platforms used
variationNumberOfApps = socialApps_df.groupby('Number of Apps')

# size() counts every row in a group. Counting the non-null values of an
# unrelated column ('2. Gender') would undercount if that column had gaps.
recipientsPerPlatformCount = variationNumberOfApps.size()
plt.bar(recipientsPerPlatformCount.index, recipientsPerPlatformCount, align='center', edgecolor='black')
plt.ylabel('Total Number of Recipients')
plt.xlabel('Number of Platforms')
plt.title('Distribution of Number of Platforms vs Number of Recipients')
# One tick per platform count that actually occurs
plt.xticks(recipientsPerPlatformCount.index)
# savefig does not create missing folders
os.makedirs('output_data', exist_ok=True)
plt.savefig('output_data/distributionNumberOfPlatforms')
plt.show()

In [None]:
#Scatter plot number of apps vs total frequency score
plt.scatter(socialApps_df['Number of Apps'], socialApps_df['Total Score'] , marker='o', alpha=0.6, edgecolors='black', s=60)

#Perform linear regression
slope, intercept, r, p, stderr = linregress(socialApps_df['Number of Apps'], socialApps_df['Total Score'])

#y=mx+b
line = slope * socialApps_df['Number of Apps'] + intercept

#Create plot
plt.plot(socialApps_df['Number of Apps'], line, 'r')
plt.annotate(f"y={slope:0.02f}x + {intercept:0.02f}", (6, 10), color='r')
plt.ylabel('Total Score from Recipients')
plt.xlabel('Number of Platforms')
plt.title('Total Frequency Score vs Number of Platforms')
plt.yticks(np.arange(5, 65, step=5))
# savefig does not create missing folders
os.makedirs('output_data', exist_ok=True)
plt.savefig('output_data/totalFrequencyVsNumberPlatforms')
plt.show()

# Describe the strength from r itself so the printed label cannot contradict
# the number. The 0.3 / 0.7 cut-offs are common rule-of-thumb thresholds.
if np.isnan(r):
    correlationStrength = "Undefined"
elif abs(r) < 0.3:
    correlationStrength = "Weak"
elif abs(r) < 0.7:
    correlationStrength = "Moderate"
else:
    correlationStrength = "Strong"
print(f"Pearson Correlation Factor: {r:0.02f}, {correlationStrength} Correlation")