In [1]:
#import all necessary libraries
import plotly.express as px
import pandas as pd
import numpy as np
import warnings

In [2]:
# Source of the pre-processed survey data (raw file hosted on GitHub).
DATA_URL = "https://raw.githubusercontent.com/Yukino09/Data-Mining-HW1/main/processed_data.csv"

# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like a row index
# that was written out with the CSV — consider reading with index_col=0 once confirmed.
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,Sex,Playing Years,Playing Often,Playing Hours,Playing Games,Grade
0,0,1,2,1,1,77.5
1,1,1,3,1,1,83.0
2,0,0,0,0,0,80.0
3,0,3,5,1,1,45.0
4,1,1,1,2,1,85.0


In [3]:
# Convert the 0/1 gender codes to readable labels (assumes 0 = female, 1 = male — TODO confirm
# against the data dictionary). Values that are not a known code pass through unchanged, so
# re-running this cell does not turn the already-labelled column into NaN.
sex_labels = {0: "Female", 1: "Male"}
df["Sex"] = df["Sex"].map(lambda code: sex_labels.get(code, code))

# Ordinal levels taken from the data itself rather than hard-coded, so every level that
# actually occurs (the preview already contains a 0) is kept in ascending order.
often_levels = sorted(df["Playing Often"].dropna().unique().tolist())

# Grouped box plot: x = how often the student plays, one box per gender, y = grade.
fig = px.box(
    df,
    x="Playing Often",
    y="Grade",
    color="Sex",
    title="The joint impact of gender and playing often on performance",
    category_orders={"Playing Often": often_levels},
    color_discrete_map={"Female": "#FF69B4", "Male": "#4169E1"},
)

fig.update_layout(
    boxmode="group",  # side-by-side boxes per x level instead of overlaid
    xaxis_title="Playing Often",
    yaxis_title="Grade",  # the y axis plots Grade; gender is encoded by colour
    hovermode="x unified",
    template="plotly_white",
)
fig.show()

In [4]:
# Bin playing years and playing hours into three equal-width ranges each.
df["Playing Years Bin"] = pd.cut(df["Playing Years"], bins=3, labels=["Low", "Medium", "High"])
df["Playing Hours Bin"] = pd.cut(df["Playing Hours"], bins=3, labels=["Short", "Medium", "Long"])

# Mean grade for every (years bin, hours bin) pair. observed=False is spelled out so the
# result keeps all bin combinations and pandas does not warn about the changing default
# for categorical group keys.
heatmap_data = (
    df.groupby(["Playing Years Bin", "Playing Hours Bin"], observed=False)["Grade"]
    .mean()
    .reset_index()
)

# Heatmap of the per-cell mean grade; text_auto prints the value inside each cell.
fig = px.density_heatmap(
    heatmap_data,
    x="Playing Years Bin",
    y="Playing Hours Bin",
    z="Grade",
    title="The Impact of Playing Years and Playing Hours Bins on Performance",
    labels={"Grade": "Average Grade"},
    color_continuous_scale="YlGnBu",
    text_auto=True,
)
fig.update_layout(
    xaxis_title="Playing Years Binning",
    yaxis_title="Playing Hours Binning",  # y is the hours bin, not playing frequency
    template="plotly_white",
)
fig.show()






In [5]:
# Grade bands are left-closed: [0, 60), [60, 80), [80, 100]. The top edge is nudged to the
# next float above 100 so that a grade of exactly 100 lands in "High" instead of falling
# outside every band (which would make it NaN and drop the point from the coloured scatter).
bins = [0, 60, 80, np.nextafter(100.0, np.inf)]
labels = ["Low (0-60)", "Medium (60-80)", "High (80-100)"]
df["Grade Category"] = pd.cut(df["Grade"], bins=bins, labels=labels, right=False)

# Pearson correlation between playing hours and grade over the whole data set.
correlation = df["Playing Hours"].corr(df["Grade"])
correlation_text = f"Correlation: {correlation:.2f}"

# Scatter plot coloured by grade band, with ONE regression line fitted to all points.
# Without trendline_scope="overall" plotly fits a separate line per colour group; because the
# groups are slices of the y variable, those per-band lines would not correspond to the
# overall correlation reported on the chart. (trendline_scope needs plotly >= 5.2.)
fig = px.scatter(
    df,
    x="Playing Hours",
    y="Grade",
    color="Grade Category",
    trendline="ols",
    trendline_scope="overall",
    title="Relationship between Playing Hours and Grade",
    labels={"Playing Hours": "Playing Hours (hours per day)", "Grade": "Grade"},
    opacity=0.7,
    color_discrete_sequence=px.colors.qualitative.Set1,
)

# Pin the correlation text to the top-right corner of the plotting area (paper coordinates)
# rather than to the maximum data point, so it never sits on top of a marker.
fig.add_annotation(
    xref="paper",
    yref="paper",
    x=0.99,
    y=0.99,
    xanchor="right",
    yanchor="top",
    text=correlation_text,
    showarrow=False,
    font=dict(size=14, color="black"),
    bgcolor="rgba(255, 255, 255, 0.7)",
)

# Axis titles come from `labels` above; only the legend title needs setting here.
fig.update_layout(legend_title="Grade Category")

# Show Chart
fig.show()

# Output correlation coefficient
print(f"Correlation coefficient between game time and scores: {correlation:.2f}")

Correlation coefficient between game time and scores: -0.06


In [6]:
# NOTE: this filter stays active for the rest of the kernel session, not just this cell.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Right-closed grade bands. include_lowest=True keeps a grade of exactly 0 in the first band;
# without it such rows become NaN and then show up as a bogus "nan" category on the chart.
bins = [0, 60, 80, 100]
bin_labels = ['[0, 60]', '(60, 80]', '(80, 100]']
df['bins'] = pd.cut(df['Grade'], bins=bins, right=True, include_lowest=True)
# Display strings for the bands (the raw interval repr of the first band would read "(-0.001, 60.0]").
df['bins_str'] = df['bins'].cat.rename_categories(bin_labels).astype(str)
# Treat the game code as a discrete category so plotly assigns discrete colours.
df['Playing Games'] = df['Playing Games'].astype(str)

# Only rows whose grade falls inside one of the bands take part in the percentages.
in_range = df[df['bins'].notna()]

# Share of each "Playing Games" value within every grade band (each band sums to 100 %).
bin_counts = in_range.groupby(['bins_str', 'Playing Games']).size().reset_index(name='count')
bin_totals = bin_counts.groupby('bins_str')['count'].transform('sum')
bin_counts['percentage'] = (bin_counts['count'] / bin_totals) * 100

fig = px.bar(bin_counts,
             x='bins_str',
             y='percentage',
             color='Playing Games',
             text='percentage',
             title='The Impact of Playing Games on Grade',
             color_discrete_sequence=px.colors.qualitative.Dark24,
             # Explicit order: the label strings do not sort into ascending grade order.
             category_orders={'bins_str': bin_labels},
             labels={'percentage': 'Percentage (%)', 'bins_str': 'Grade range'})


fig.update_traces(texttemplate='%{text:.1f}%', textposition='inside')
fig.show()