# Online Gaming and Anxiety

(Data Cleaning done by Aliyu Muraina, Amy Dohlin, Andrew Arjune and Anna Bitzer)

In [None]:
# import dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import scipy.stats as st

In [None]:
# Relative path to the scrubbed survey CSV; it is resolved against the
# directory the notebook is run from.
gaming = Path("GamingStudy/Scrub_gaming_data.csv")

In [None]:
# Load the survey CSV located at `gaming` into the gaming_df dataframe
gaming_df = pd.read_csv(filepath_or_buffer=gaming)
# Display the loaded dataframe as the cell output
gaming_df

In [None]:
# List the column labels of the loaded data so the ones to keep can be chosen
gaming_df.columns

In [None]:
# Columns carried forward from gaming_df into the trimmed dataframe
columns_to_keep = [
    "S. No.", "Game", "Platform", "Hours", "earnings", "whyplay", "Gender", "Age",
    "Work", "Degree", "Residence", "Playstyle", "GAD_T", "SWL_T", "SPIN_T",
]
clean_game = gaming_df.loc[:, columns_to_keep]
clean_game

In [None]:
# Save the trimmed dataframe to a new CSV in the current working directory.
# to_csv is called with its defaults, so the dataframe index is written as the
# first column of the file.
clean_game.to_csv("clean_game_data.csv")


# Research Question: Are people who play for fun more likely to be gainfully employed?

(Analysis completed by Aliyu Muraina)

In [None]:
# Tally how many respondents gave each answer in the "whyplay" column;
# the per-reason totals used below are read from this tally.
whyplay_answers = clean_game["whyplay"]
Numb_of_people_whyplay = whyplay_answers.value_counts()
Numb_of_people_whyplay

In [None]:
# Tally respondents by their answer in the "earnings" column
earnings_answers = clean_game['earnings']
total_numb_of_earning_players = earnings_answers.value_counts()
total_numb_of_earning_players

In [None]:
# Number of respondents whose "whyplay" answer is "having fun"
total_fun_players = Numb_of_people_whyplay.loc["having fun"].sum()
# Number of respondents whose "whyplay" answer is "winning"
total_money_players = Numb_of_people_whyplay.loc["winning"].sum()

In [None]:
# One-row dataframe placing the "having fun" and "winning" head counts side by side
play_reason_counts = {
    "Having fun": [total_fun_players],
    "Winning": [total_money_players],
}
combined_data = pd.DataFrame(play_reason_counts)
combined_data


In [None]:
# Bar chart comparing the number of respondents who play to win with the
# number who play for fun. The heights come straight from the two totals
# computed above; combined_data holds only those same two counts, so no
# further filtering of it is needed.
x = ['Winning', 'Having fun']
y = [total_money_players, total_fun_players]

# Explicit figure/axes so the labels are attached to this chart only
fig, ax = plt.subplots()
ax.bar(x, y)
ax.set_title('Players Who Play for Money vs For Fun')
ax.set_xlabel('Play Type')
ax.set_ylabel('Number of Players')

plt.show()



In [None]:
# For Fun: are people who play for fun likely to be gainfully employed?
# Start by pulling the rows of clean_game whose "Work" value is 'Employed'.
is_employed = clean_game['Work'] == 'Employed'
Employed_people = clean_game.loc[is_employed]
Employed_people.head()



In [None]:
# Number of respondents in each "Work" category
work_answers = clean_game.loc[:, "Work"]
Numb_of_people_work = work_answers.value_counts()
Numb_of_people_work

In [None]:
# Head count for the "Employed" category.
# Note: this rebinds Employed_people from the filtered rows to a single count.
Employed_people = Numb_of_people_work.loc["Employed"]
Employed_people

In [None]:
# Head count for the "Unemployed / between jobs" category
Unemployed_people = Numb_of_people_work.loc["Unemployed / between jobs"].sum()
Unemployed_people

In [None]:
# Narrow the cleaned data to the two columns this question needs
Wanted_df = clean_game[["whyplay", "Work"]]
# head has to be called: a bare `Wanted_df.head` only shows the bound-method
# repr instead of the first rows
Wanted_df.head()

In [None]:
# Draws two bar series at the same two x positions ('whyplay' and 'Work').
# NOTE(review): the bar heights below are hard-coded literals, not values
# derived from clean_game or Wanted_df, and the second plt.bar call is drawn
# over the first. This looks like a placeholder chart; confirm the intended
# data before relying on it.
x = ['whyplay','Work']
y1 = [10,20]
y2 = [5,15]
plt.bar(x,y1,label='Work')
plt.bar(x,y2,label='Whyplay')
plt.legend()
plt.show()


In [None]:
# One-row dataframe pairing the "having fun" head count with the employed head count
fun_and_employed = {
    "Having fun": [total_fun_players],
    "Employed": [Employed_people],
}
Combined_df = pd.DataFrame(fun_and_employed)
Combined_df.head()

In [None]:
# Employment breakdown for respondents who play for fun.
# The filter runs on the respondent-level data (clean_game): Combined_df only
# holds two aggregate counts, so matching its values against label strings
# would select nothing.
having_fun_players = clean_game.loc[clean_game["whyplay"] == "having fun"]

# Number of fun players in each "Work" category
having_fun_employment = having_fun_players["Work"].value_counts()

# Create bar chart of employment for people having fun
fig, ax = plt.subplots()
having_fun_employment.plot(kind="bar", ax=ax)
ax.set_title("Employment Status of Players Who Play for Fun")
ax.set_xlabel("Employment Status")
ax.set_ylabel("Number of Players")
plt.show()

# Show the underlying counts below the chart
having_fun_employment





In [None]:
# % of employed vs % of unemployed
# Both shares use the same denominator: the total number of respondents with a
# recorded "Work" answer (value_counts() leaves out missing values by default).
# Dividing by the per-category counts instead would not give a share of
# respondents.
total_work_responses = Numb_of_people_work.sum()

# % of employed
employed_df = Numb_of_people_work["Employed"].sum()
percentage_employed_df = employed_df / total_work_responses * 100

# % of unemployed
unemployed_count = Numb_of_people_work["Unemployed / between jobs"].sum()
percentage_unemployed = unemployed_count / total_work_responses * 100

# Display both percentages together
pd.Series({
    "Employed (%)": percentage_employed_df,
    "Unemployed / between jobs (%)": percentage_unemployed,
})


# Research Question: Americans vs Europeans: Who spends more time gaming, and how does it compare to their satisfaction with life scores?

(Analysis completed by Amy Dohlin)

In [None]:
# Build a dataframe with S. No. (survey number), Hours, Residence, and
# SWL_T (satisfaction with life total score)
residence_df = clean_game[["S. No.", "Hours", "Residence", "SWL_T"]]

# drop non-US and non-European countries
# coding help from Xpert Learning Assistant
# European countries reference: https://www.worldometers.info/geography/how-many-countries-in-europe/#google_vignette
patterns_to_drop = ['South Korea', 'Japan', 'Canada',
       'Australia', 'Malaysia',
       'Turkey', 'South Africa', 'India',
       'Argentina',
       'Singapore', 'China', 'Unknown',
       'Saudi Arabia',
       'Jordan', 'Brazil', 'Venezuela', 'Tunisia', 'Israel',
       'Qatar', 'Mexico', 'Philippines',
       'Egypt', 'New Zealand ',
       'Algeria', 'UAE', 'Chile', 'Lebanon',
       'Thailand', 'Peru', 'Namibia', 'Uruguay', 'Kuwait', 'Bangladesh',
       'Syria', 'Colombia', 'Dominican Republic', 'Nicaragua',
       'Hong Kong', 'Vietnam', 'Kazakhstan', 'Guadeloupe',
       'Grenada', 'Trinidad & Tobago', 'Panama', 'Indonesia',
       'Puerto Rico', 'Taiwan', 'Costa Rica', 'Belize',
       'Jamaica', 'Georgia', 'Faroe Islands', 'Guatemala', 'Moldova',
       'Mongolia', 'Honduras', 'Bahrain', 'Bolivia',
       'El Salvador', 'Ecuador', 'Pakistan', 'Republic of Kosovo',
       'St Vincent', 'Brunei', 'Fiji', 'Gibraltar ', 'Palestine']

# Regex alternation: a row is dropped when its Residence contains any listed name
drop_pattern = '|'.join(patterns_to_drop)

# na=True counts a missing Residence as a match, so such rows are dropped like
# 'Unknown'. Without it a missing value leaves NaN in the mask, which cannot be
# inverted with `~` or used for boolean indexing.
is_dropped = residence_df["Residence"].str.contains(drop_pattern, na=True)

# .copy() makes the filtered result an independent dataframe, so columns can be
# added to it later without pandas' SettingWithCopyWarning
residence_df = residence_df.loc[~is_dropped].copy()
residence_df

In [None]:
# Label every remaining row as "USA" or "Europe". Any Residence other than
# "USA" is labelled "Europe", which relies on the earlier filter having removed
# all other regions. np.where fills the column in one vectorized step, so it
# holds strings from the start rather than being created as float NaN and then
# overwritten row by row.
residence_df["USA or Eur"] = np.where(residence_df["Residence"] == "USA", "USA", "Europe")

# Per-region rows reused by every statistic below
usa_rows = residence_df.loc[residence_df["USA or Eur"] == "USA"]
eur_rows = residence_df.loc[residence_df["USA or Eur"] == "Europe"]

separator = "-----------------------------------------------------------------------"

# number of respondents in each region
count_residence = residence_df["USA or Eur"].value_counts()
print(count_residence)

print(separator)

# total hours played (per week) by USA residents vs hours played (per week) by European residents
usa_hours = usa_rows["Hours"].sum()
print("The total number of hours played per week by gamers in the USA is " + str(usa_hours))

eur_hours = eur_rows["Hours"].sum()
print("The total number of hours played per week by gamers in Europe is " + str(eur_hours))

print(separator)

# average hours played (per week) by USA residents vs average hours played (per week) by European residents
usa_hours_avg = usa_rows["Hours"].mean()
print("The average number of hours played per week by gamers in the USA is " + str(usa_hours_avg))

eur_hours_avg = eur_rows["Hours"].mean()
print("The average number of hours played per week by gamers in Europe is " + str(eur_hours_avg))

print(separator)

# average SWL scores for USA residents and European residents
usa_swl = usa_rows["SWL_T"].mean()
print("The average SWL score for USA residents is " + str(usa_swl))

eur_swl = eur_rows["SWL_T"].mean()
print("The average SWL score for European residents is " + str(eur_swl))

In [None]:
# Create pie chart of US players vs European players.

# format labels and colors
labels = ["European Players","USA Players"]
# Wedge sizes are read from the per-region respondent counts (count_residence)
# so the chart follows the data instead of hard-coded numbers; the order
# matches `labels`.
sizes = [count_residence["Europe"], count_residence["USA"]]
colors = ["lightskyblue","red"]
# pull the first wedge (Europe) slightly out of the pie
explode = [0.1, 0]

# create pie chart with the above values. Add a title
plt.pie(sizes, explode =explode, labels = labels, colors = colors, autopct = "%1.1f%%", shadow = True, startangle = 90)
plt.title("Distribution of USA Players and European Players")
# savefig is called before show so the rendered figure is what gets written
plt.savefig("Images/USEuroPlayersPie.png")
plt.show()

In [None]:
# Overlaid histograms of SWL scores for USA vs Europe, preceded by a printout
# of the observed ranges of hours played and SWL scores

hours_col = residence_df["Hours"]
swl_col = residence_df["SWL_T"]

# observed extremes of both measures
hours_max, hours_min = hours_col.max(), hours_col.min()
swl_max, swl_min = swl_col.max(), swl_col.min()

print("Max hours played: " + str(hours_max))
print("Min hours played: " + str(hours_min))
print("Max SWL score: " + str(swl_max))
print("Min SWL score: " + str(swl_min))

# SWL scores split by region label
region_label = residence_df["USA or Eur"]
swl_usa = swl_col[region_label == "USA"]
swl_europe = swl_col[region_label == "Europe"]

# Europe is drawn first, USA on top; alpha keeps both visible
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(swl_europe, color='lightskyblue', alpha=0.7, label='Europe', bins=10)
ax.hist(swl_usa, color='red', alpha=0.7, label='USA', bins=10)

ax.set_title('Distribution of SWL Scores by Category')
ax.set_xlabel('Satisfaction of Life Score')
ax.set_ylabel('Frequency')
ax.legend()
fig.savefig("Images/SWLHist.png")

plt.show()

In [None]:
# Overlaid histograms of weekly hours played for USA vs Europe

# hours played split by region label
region_label = residence_df["USA or Eur"]
hours_usa = residence_df["Hours"][region_label == "USA"]
hours_europe = residence_df["Hours"][region_label == "Europe"]

# Europe is drawn first, USA on top; alpha keeps both visible
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(hours_europe, color='lightskyblue', alpha=0.7, label='Europe', bins=10)
ax.hist(hours_usa, color='red', alpha=0.7, label='USA', bins=10)

ax.set_title('Distribution of Hours Played by Category')
ax.set_xlabel('Hours Played per Week')
ax.set_ylabel('Frequency')
ax.legend()
fig.savefig("Images/HoursHist.png")

plt.show()

In [None]:
# Descriptive statistics of weekly hours played, one row per region label
hours_df = residence_df.loc[:, ["USA or Eur", "Hours"]]
grouped_hours = hours_df.groupby("USA or Eur").describe()
grouped_hours


In [None]:
# Descriptive statistics of SWL scores, one row per region label
swl_df = residence_df.loc[:, ["USA or Eur", "SWL_T"]]
grouped_swl = swl_df.groupby("USA or Eur").describe()
grouped_swl

In [None]:
# Independent two-sample t-test on SWL scores (USA vs Europe);
# equal_var=False selects Welch's variant, which does not assume equal variances
swl_ttest = st.ttest_ind(swl_usa, swl_europe, equal_var=False)
swl_ttest

# Research Question: Do professional players have a better satisfaction with life (SWL) score than casual players?

(Analysis completed by Andrew Arjune)

In [None]:
# Columns needed for the level-of-play analysis. "S. No." has to be selected
# too: it is renamed to "Survey Number" and used as the index below, and
# set_index raises KeyError when that column is absent.
# NOTE(review): the section heading asks about SWL scores, but this analysis
# reads GAD_T; confirm which measure is intended.
level_df = gaming_df[["S. No.", "earnings", "GAD_T"]]
level_df = level_df.rename(columns={"S. No.": "Survey Number"})
level_df = level_df.set_index("Survey Number")

In [None]:
# Keep only the three listed "earnings" answers and report how many rows fall
# outside them
levels_to_keep = [
    "I play for fun",
    "I play mostly for fun but earn a little on the side (tournament winnings  streaming  etc)",
    "I earn a living by playing this game",
]
level_filtered = level_df[level_df["earnings"].isin(levels_to_keep)]

# row counts before and after filtering
all_level = len(level_df)
filtered_level = len(level_filtered)

# share of rows removed, as a whole-number percentage
dropped_level = all_level - filtered_level
percent_dropped = round(dropped_level / all_level * 100)

print(f'For the scope of this analysis, only the top 3 levels of play were included. A total of {dropped_level} data points ({percent_dropped}% of the total) were dropped.')

In [None]:
#create a piechart to give a visual of the level distribution
#calculate number of each level
level_breakdown = level_filtered["earnings"].value_counts()

# A pie chart needs exactly one explode offset per wedge, so the offsets are
# sized from the number of levels actually present. Only the first wedge (the
# most common level, since value_counts sorts descending) is pulled out.
explode = [0.1 if position == 0 else 0 for position in range(len(level_breakdown))]

#plot, format percentages, format visuals, add title
level_breakdown.plot(kind = "pie", title = "Breakdown of Levels",
                     autopct = "%1.1f%%", explode = explode, shadow = True,
                     startangle = 140, ylabel = "")

plt.show()

In [None]:
# Summary statistics of GAD_T across all rows kept in level_filtered
level_gad = level_filtered["GAD_T"]
l_min = level_gad.min()
l_max = level_gad.max()
l_mean = level_gad.mean()
l_median = level_gad.median()
l_std = level_gad.std()

# collect the statistics into a one-row dataframe
overall_stats = {
    "Min. GAD Result": l_min,
    "Max. GAD Result": l_max,
    "Mean GAD Result": l_mean,
    "Median GAD Result": l_median,
    "GAD Std. Dev.": l_std,
}
l_GAD_summary = pd.DataFrame([overall_stats])

# round the two non-integer statistics to 2 decimals for display
for rounded_col in ("Mean GAD Result", "GAD Std. Dev."):
    l_GAD_summary[rounded_col] = l_GAD_summary[rounded_col].round(2)

l_GAD_summary

In [None]:
# Summary statistics of GAD_T for each "earnings" answer
gad_summary = level_filtered.groupby(["earnings"])
gad_by_level = gad_summary["GAD_T"]

e_min = gad_by_level.min()
e_max = gad_by_level.max()
e_mean = gad_by_level.mean()
e_median = gad_by_level.median()
e_std = gad_by_level.std()

# full table: one row per earnings answer
per_level_stats = {
    "Min. GAD Result": e_min,
    "Max. GAD Result": e_max,
    "Mean GAD Result": e_mean,
    "Median GAD Result": e_median,
    "GAD Std. Dev.": e_std,
}
e_GAD_summary = pd.DataFrame(per_level_stats)

# round the two non-integer statistics to 2 decimals for display
for rounded_col in ("Mean GAD Result", "GAD Std. Dev."):
    e_GAD_summary[rounded_col] = e_GAD_summary[rounded_col].round(2)

# short table: mean, median and standard deviation only, taken from the
# already-rounded full table
e_GAD_short_summary = e_GAD_summary[["Mean GAD Result", "Median GAD Result", "GAD Std. Dev."]].copy()

e_GAD_short_summary

In [None]:
# Box plot of GAD scores, one box per level of play
fig1, ax1 = plt.subplots()
e_list = ["I play for fun",
                "I play mostly for fun but earn a little on the side (tournament winnings  streaming  etc)",
                "I earn a living by playing this game", ]

# one Series of GAD scores per level, in the same order as e_list
e_plot = []

for level in e_list:
    # filter on the loop variable `level`; there is no variable named `earnings`
    to_plot = level_filtered.loc[level_filtered["earnings"] == level, "GAD_T"]
    e_plot.append(to_plot)

ax1.boxplot(e_plot)
ax1.set_xticklabels(e_list, wrap = True)
ax1.set_ylabel("GAD Score")
plt.title("GAD Score Distribution by Competitive Level")
# rotate the long tick labels so they do not overlap
#https://stackoverflow.com/questions/26700598/matplotlib-showing-x-tick-labels-overlapping
fig1.autofmt_xdate()

plt.show()

# Research Question: Are gamers that play alone more anxious than those who play socially?

(Analysis completed by Anna Bitzer)

In [None]:
# Select the columns relevant to the playstyle question and index the rows by
# survey number ("S. No." renamed to "Survey Number")
playstyle_df = (
    gaming_df[["S. No.", "Gender", "Age", "Playstyle", "GAD_T"]]
    .rename(columns={"S. No.": "Survey Number"})
    .set_index("Survey Number")
)

playstyle_df.head()

In [None]:
# Keep only the five listed playstyles and report how many rows fall outside them
playstyles_to_keep = [
    "Multiplayer - online - with real life friends",
    "Multiplayer - online - with strangers",
    "Multiplayer - online - with online acquaintances or teammates",
    "Singleplayer",
    "Multiplayer - offline (people in the same room)",
]
playstyle_filtered = playstyle_df[playstyle_df["Playstyle"].isin(playstyles_to_keep)]

# row counts before and after filtering
all_playstyle = len(playstyle_df)
filtered_playstyle = len(playstyle_filtered)

# share of rows removed, as a whole-number percentage
dropped_playstyle = all_playstyle - filtered_playstyle
percent_dropped = round(dropped_playstyle / all_playstyle * 100)

print(f'For the scope of this analysis, only the top 5 playstyles were included. A total of {dropped_playstyle} data points ({percent_dropped}% of the total) were dropped.')


In [None]:
# Pie chart of how the kept rows are distributed across playstyles
# number of rows per playstyle
playstyle_breakdown = playstyle_filtered["Playstyle"].value_counts()

# chart options: percentage labels, first wedge pulled out, shadow, start
# angle, and no y-axis label
pie_options = dict(
    kind="pie",
    title="Breakdown of Playstyles",
    autopct="%1.1f%%",
    explode=(0.1, 0, 0, 0, 0),
    shadow=True,
    startangle=140,
    ylabel="",
)
playstyle_breakdown.plot(**pie_options)

plt.show()


In [None]:
# Summary statistics of GAD score across all kept rows (min, max, mean, median, std dev)
# the p_ prefix denotes an overall-population result
population_gad = playstyle_filtered["GAD_T"]
p_min = population_gad.min()
p_max = population_gad.max()
p_mean = population_gad.mean()
p_median = population_gad.median()
p_std = population_gad.std()

# collect the statistics into a one-row dataframe
population_stats = {
    "Min. GAD Result": p_min,
    "Max. GAD Result": p_max,
    "Mean GAD Result": p_mean,
    "Median GAD Result": p_median,
    "GAD Std. Dev.": p_std,
}
p_GAD_summary = pd.DataFrame([population_stats])

# round the two non-integer statistics to 2 decimals for display
for rounded_col in ("Mean GAD Result", "GAD Std. Dev."):
    p_GAD_summary[rounded_col] = p_GAD_summary[rounded_col].round(2)

p_GAD_summary


In [None]:
# Min, max, mean, median and std. dev. of GAD score for each playstyle
gad_summary = playstyle_filtered.groupby(["Playstyle"])
gad_by_playstyle = gad_summary["GAD_T"]

# the ps_ prefix denotes a per-playstyle result
ps_min = gad_by_playstyle.min()
ps_max = gad_by_playstyle.max()
ps_mean = gad_by_playstyle.mean()
ps_median = gad_by_playstyle.median()
ps_std = gad_by_playstyle.std()

# full table: one row per playstyle
per_playstyle_stats = {
    "Min. GAD Result": ps_min,
    "Max. GAD Result": ps_max,
    "Mean GAD Result": ps_mean,
    "Median GAD Result": ps_median,
    "GAD Std. Dev.": ps_std,
}
ps_GAD_summary = pd.DataFrame(per_playstyle_stats)

# round the two non-integer statistics to 2 decimals for display
for rounded_col in ("Mean GAD Result", "GAD Std. Dev."):
    ps_GAD_summary[rounded_col] = ps_GAD_summary[rounded_col].round(2)

# short table: mean, median and standard deviation only, taken from the
# already-rounded full table
ps_GAD_short_summary = ps_GAD_summary[["Mean GAD Result", "Median GAD Result", "GAD Std. Dev."]].copy()

ps_GAD_short_summary

While all playstyles were associated with similar average GAD scores, the average GAD score of players who played alone was higher than that of players who played multiplayer.

In [None]:
# Box plot of GAD score, one box per playstyle
fig1, ax1 = plt.subplots()
ps_list = [
    "Multiplayer - offline (people in the same room)",
    "Multiplayer - online - with real life friends",
    "Multiplayer - online - with strangers",
    "Multiplayer - online - with online acquaintances or teammates",
    "Singleplayer",
]

# one Series of GAD scores per playstyle, in the same order as ps_list
ps_plot = [
    playstyle_filtered.loc[playstyle_filtered["Playstyle"] == style_name, "GAD_T"]
    for style_name in ps_list
]

ax1.boxplot(ps_plot)
ax1.set_xticklabels(ps_list, wrap = True)
ax1.set_ylabel("GAD Score")
ax1.set_title("GAD Score Distribution by Playstyle")
# rotate the long tick labels so they do not overlap
#https://stackoverflow.com/questions/26700598/matplotlib-showing-x-tick-labels-overlapping
fig1.autofmt_xdate()

plt.show()


The GAD data was not normally distributed, and was instead right skewed, with the median less than the mean for all playstyle categories.

In [None]:
# Each playstyle shows many outliers in the box plot; count them per playstyle
# using the 1.5 * IQR rule and report them as a share of that playstyle's rows.
for playstyle in ps_list:
    # GAD scores for this playstyle only (selected once and reused below)
    gad_scores = playstyle_filtered.loc[playstyle_filtered["Playstyle"] == playstyle, "GAD_T"]

    # a playstyle with no scores has no quartiles and would divide by zero below
    if gad_scores.count() == 0:
        print(f'{playstyle} has no data points.')
        continue

    # first and third quartiles and the interquartile range
    lower_q, upper_q = gad_scores.quantile([0.25, 0.75])
    iqr = upper_q - lower_q

    # boundaries beyond which a score counts as an outlier
    upper_outliers = upper_q + (1.5*iqr)
    lower_outliers = lower_q - (1.5*iqr)

    # number of scores above / below the boundaries
    upper_outliers_count = (gad_scores > upper_outliers).sum()
    lower_outliers_count = (gad_scores < lower_outliers).sum()
    outlier_total = upper_outliers_count + lower_outliers_count

    # outliers as a whole-number percentage of this playstyle's scores
    outlier_percent = round(outlier_total/(gad_scores.count()) *100)
    print(f'{playstyle} has {outlier_total} outliers ({outlier_percent}% of data points).')


In [None]:
#the largest difference in mean GAD score is between the groups Multiplayer - offline (people in the same room) and Singleplayer
#use the Welch's t-test to determine if there is a significant difference between these two groups
multi_offline = playstyle_filtered.loc[playstyle_filtered["Playstyle"] == "Multiplayer - offline (people in the same room)", "GAD_T"]
single = playstyle_filtered.loc[playstyle_filtered["Playstyle"] == "Singleplayer", "GAD_T"]

# scipy.stats is imported as `st` at the top of the notebook;
# equal_var=False selects Welch's variant of the test
st.ttest_ind(multi_offline, single, equal_var = False)


A Welch's t-test was performed to compare the group with the lowest mean GAD score (4.21; gamers who played offline multiplayer) against the group with the highest mean GAD score (5.80; gamers who played single player). The difference between the two groups was statistically significant at the 0.05 level, with a p-value of 0.03.

It should be noted that these two groups were by far the smallest of the playstyles analyzed - 5.8% of the population played single player, and only 0.4% of the population played offline multiplayer. 

Sources used by Anna:
https://stackoverflow.com/questions/26700598/matplotlib-showing-x-tick-labels-overlapping
Bootcampspot: Xpert Learning Assistant (used for outlier section, and for help formatting boxplot)