In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

# Input files. Despite the "_url" suffix these are local filesystem paths.
# NOTE(review): 'Masa端st端' looks like a mis-encoded 'Masaüstü' -- confirm
# that the path below actually resolves on disk.
chess_url = 'C:\\Users\\kosot\\OneDrive\\Masa端st端\\CS210-HW2\\Project\\DSA210_Term_Project_bugrahan.yapilmisev\\Data_Extraction\\filtered_chess_games_modified_png.csv'
academic_url = 'C:\\Users\\kosot\\OneDrive\\Masa端st端\\CS210-HW2\\Project\\DSA210_Term_Project_bugrahan.yapilmisev\\Data_Extraction\\exam_and_assignment_dates.csv'

# Load the data
chess_data = pd.read_csv(chess_url)
academic_data = pd.read_csv(academic_url)

# Chess: bucket every row (treated as one game) by the calendar day it ended on
# and count the rows per day.
chess_data['end_time'] = pd.to_datetime(chess_data['end_time'])
chess_data['date'] = chess_data['end_time'].dt.date
daily_chess_usage = chess_data.groupby('date').size().reset_index(name='chess_count')

# Academic: parse with the explicit day-first format that the later cells of
# this notebook use for the same file. Without it pandas guesses the layout and
# can swap day and month (or reject the column), which misaligns the merge below.
academic_data['date'] = pd.to_datetime(academic_data['date'], format='%d/%m/%Y').dt.date
academic_data['workload'] = 1  # every exam/assignment row counts as one unit of workload
daily_workload = academic_data.groupby('date')['workload'].sum().reset_index()

# Outer merge keeps days that appear in only one source; the missing counts on
# those days are genuinely zero, so fill them with 0.
merged_data = pd.merge(daily_chess_usage, daily_workload, on='date', how='outer').fillna(0)

# Scatter: one point per day, workload on x and number of games on y
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(merged_data['workload'], merged_data['chess_count'], alpha=0.7)
ax.set_xlabel('Academic Workload')
ax.set_ylabel('Chess Usage (Game Count)')
ax.set_title('Academic Workload vs. Chess Usage')
ax.grid(True)
plt.show()

# Time series: both daily series drawn against the same date axis
fig, ax = plt.subplots(figsize=(12, 6))
for column, label, marker in (
    ('workload', 'Workload', 'o'),
    ('chess_count', 'Chess Usage', 'x'),
):
    ax.plot(merged_data['date'], merged_data[column], label=label, marker=marker)
ax.set_xlabel('Date')
ax.set_ylabel('Counts')
ax.set_title('Trends in Workload and Chess Usage Over Time')
ax.legend()
ax.grid(True)
plt.show()

# Pearson correlation between daily workload and daily game count
correlation, p_value = pearsonr(
    merged_data['workload'],
    merged_data['chess_count'],
)
print(f"Pearson Correlation: {correlation:.2f}")
print(f"P-value: {p_value:.2e}")

# Verdict: significance is judged at the 5% level, direction by the sign of r
is_significant = p_value < 0.05
if is_significant:
    print("There is a significant correlation between academic workload and chess usage.")
    verdict = (
        "Hypothesis supported: Chess usage increases with academic workload."
        if correlation > 0
        else "Hypothesis not supported: Chess usage decreases with academic workload."
    )
    print(verdict)
else:
    print("No significant correlation found.")


In [None]:
# Reload both sources so this cell starts from the raw CSV contents.
chess_data = pd.read_csv(chess_url)
academic_data = pd.read_csv(academic_url)

# Chess: derive the calendar day each game ended on and a 0/1 win flag.
chess_data['end_time'] = pd.to_datetime(chess_data['end_time'])
chess_data['date'] = chess_data['end_time'].dt.date
chess_data['win'] = chess_data['result_standardized'].apply(lambda x: 1 if x == 'win' else 0)

# Per day: number of games and share of games won.
daily_chess = chess_data.groupby('date').agg(
    chess_count=('url', 'size'),
    win_rate=('win', 'mean')
).reset_index()

# Academic: parse with the explicit day-first format that the later cells of
# this notebook use for the same file. Without it pandas guesses the layout and
# can swap day and month (or reject the column), which misaligns the merge below.
academic_data['date'] = pd.to_datetime(academic_data['date'], format='%d/%m/%Y').dt.date
academic_data['workload'] = 1  # every exam/assignment row counts as one unit of workload
daily_workload = academic_data.groupby('date')['workload'].sum().reset_index()

# Outer merge keeps days present in only one source. fillna(0) is right for the
# two count columns, but it also sets win_rate to 0 on days without any game, so
# win-rate analyses should look only at rows where chess_count > 0.
merged_data = pd.merge(daily_chess, daily_workload, on='date', how='outer').fillna(0)

# Scatter plot: one point per day, academic workload vs. games played
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='workload', y='chess_count', data=merged_data, alpha=0.7, ax=ax)
ax.set_xlabel('Academic Workload')
ax.set_ylabel('Chess Usage (Game Count)')
ax.set_title('Academic Workload vs Chess Usage')
ax.grid(True)
plt.show()

# Line plots: workload, game count and win rate on one shared date axis
fig, ax = plt.subplots(figsize=(12, 6))
for column, label, marker in (
    ('workload', 'Workload', 'o'),
    ('chess_count', 'Chess Usage', 'x'),
    ('win_rate', 'Win Rate', '^'),
):
    ax.plot(merged_data['date'], merged_data[column], label=label, marker=marker)
ax.set_xlabel('Date')
ax.set_ylabel('Counts/Win Rate')
ax.set_title('Trends in Workload, Chess Usage, and Win Rate Over Time')
ax.legend()
ax.grid(True)
plt.show()

# Boxplot: distribution of the daily win rate at each workload level
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='workload', y='win_rate', data=merged_data, ax=ax)
ax.set_xlabel('Academic Workload')
ax.set_ylabel('Win Rate')
ax.set_title('Workload vs Win Rate')
ax.grid(True)
plt.show()

# Heatmap: pairwise correlations of the three daily metrics
correlation_matrix = merged_data[['workload', 'chess_count', 'win_rate']].corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix for Workload, Chess Usage, and Win Rate')
plt.show()

# Statistical Analysis
# Game counts are meaningful on every day (0 = no games), so use all rows.
corr_usage, p_usage = pearsonr(merged_data['workload'], merged_data['chess_count'])

# A win rate only exists on days with at least one game. The merge above fills
# no-game days with win_rate = 0, which would be read as "lost every game" and
# distort the correlation, so restrict this test to days that were played.
played_days = merged_data[merged_data['chess_count'] > 0]
if len(played_days) >= 2:
    corr_win_rate, p_win_rate = pearsonr(played_days['workload'], played_days['win_rate'])
else:
    # pearsonr needs at least two observations; report "undefined" instead of raising.
    corr_win_rate, p_win_rate = float('nan'), float('nan')

print(f"Correlation between workload and chess usage: {corr_usage:.2f} (p={p_usage:.2e})")
print(f"Correlation between workload and win rate: {corr_win_rate:.2f} (p={p_win_rate:.2e})")

# Interpretation of Results (5% significance level; a NaN p-value falls into
# the "no significant correlation" branch because NaN < 0.05 is False)
if p_usage < 0.05:
    print("Significant correlation between workload and chess usage.")
    if corr_usage > 0:
        print("Hypothesis supported: Chess usage increases with workload.")
    else:
        print("Hypothesis not supported: Chess usage decreases with workload.")
else:
    print("No significant correlation between workload and chess usage.")

if p_win_rate < 0.05:
    print("Significant correlation between workload and win rate.")
    if corr_win_rate > 0:
        print("Higher workload correlates with better win rates.")
    else:
        print("Higher workload correlates with worse win rates.")
else:
    print("No significant correlation between workload and win rate.")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr


# Reload both sources so this cell starts from the raw CSV contents.
chess_data = pd.read_csv(chess_url)
academic_data = pd.read_csv(academic_url)

# Chess: derive the calendar day each game ended on and a 0/1 win flag.
chess_data['end_time'] = pd.to_datetime(chess_data['end_time'])
chess_data['date'] = chess_data['end_time'].dt.date
chess_data['win'] = chess_data['result_standardized'].apply(lambda x: 1 if x == 'win' else 0)

# Aggregate chess data by day: number of games and share of games won.
daily_chess = chess_data.groupby('date').agg(
    chess_count=('url', 'size'),
    win_rate=('win', 'mean')
).reset_index()

# Academic: parse with the explicit day-first format that the later cells of
# this notebook use for the same file. Without it pandas guesses the layout and
# can swap day and month (or reject the column), which shifts every window below.
academic_data['date'] = pd.to_datetime(academic_data['date'], format='%d/%m/%Y').dt.date

# Spread each event's workload over a look-back window: the event day itself
# plus the (WORKLOAD_WINDOW_DAYS - 1) days before it. Overlapping windows add up.
WORKLOAD_WINDOW_DAYS = 10
extended_workload = pd.DataFrame(
    [
        {'date': event_date - pd.Timedelta(days=offset), 'workload': 1}
        for event_date in academic_data['date']
        for offset in range(WORKLOAD_WINDOW_DAYS)
    ],
    # Explicit columns keep the groupby working even if there are no events.
    columns=['date', 'workload'],
)
daily_workload = extended_workload.groupby('date').sum().reset_index()

# Outer merge keeps days present in only one source. fillna(0) is right for the
# two count columns, but it also sets win_rate to 0 on days without any game.
merged_data = pd.merge(daily_chess, daily_workload, on='date', how='outer').fillna(0)

# Scatter plot: one point per day, windowed workload vs. games played
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='workload', y='chess_count', data=merged_data, alpha=0.7, ax=ax)
ax.set(
    xlabel='Academic Workload',
    ylabel='Chess Usage (Game Count)',
    title='Academic Workload vs Chess Usage',
)
ax.grid(True)
plt.show()

# Line plots: workload, game count and win rate on one shared date axis
series_specs = (
    ('workload', 'Workload', 'o'),
    ('chess_count', 'Chess Usage', 'x'),
    ('win_rate', 'Win Rate', '^'),
)
fig, ax = plt.subplots(figsize=(12, 6))
for column, label, marker in series_specs:
    ax.plot(merged_data['date'], merged_data[column], label=label, marker=marker)
ax.set(
    xlabel='Date',
    ylabel='Counts/Win Rate',
    title='Trends in Workload, Chess Usage, and Win Rate Over Time',
)
ax.legend()
ax.grid(True)
plt.show()

# Boxplot: distribution of the daily win rate at each workload level
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='workload', y='win_rate', data=merged_data, ax=ax)
ax.set(xlabel='Academic Workload', ylabel='Win Rate', title='Workload vs Win Rate')
ax.grid(True)
plt.show()

# Heatmap: pairwise correlations of the three daily metrics
metric_columns = ['workload', 'chess_count', 'win_rate']
correlation_matrix = merged_data[metric_columns].corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix for Workload, Chess Usage, and Win Rate')
plt.show()

# Statistical Analysis
# Game counts are meaningful on every day (0 = no games), so use all rows.
corr_usage, p_usage = pearsonr(merged_data['workload'], merged_data['chess_count'])

# A win rate only exists on days with at least one game. The merge above fills
# no-game days with win_rate = 0, which would be read as "lost every game" and
# distort the correlation, so restrict this test to days that were played.
played_days = merged_data[merged_data['chess_count'] > 0]
if len(played_days) >= 2:
    corr_win_rate, p_win_rate = pearsonr(played_days['workload'], played_days['win_rate'])
else:
    # pearsonr needs at least two observations; report "undefined" instead of raising.
    corr_win_rate, p_win_rate = float('nan'), float('nan')

print(f"Correlation between workload and chess usage: {corr_usage:.2f} (p={p_usage:.2e})")
print(f"Correlation between workload and win rate: {corr_win_rate:.2f} (p={p_win_rate:.2e})")

# Interpretation of Results (5% significance level; a NaN p-value falls into
# the "no significant correlation" branch because NaN < 0.05 is False)
if p_usage < 0.05:
    print("Significant correlation between workload and chess usage.")
    if corr_usage > 0:
        print("Hypothesis supported: Chess usage increases with workload.")
    else:
        print("Hypothesis not supported: Chess usage decreases with workload.")
else:
    print("No significant correlation between workload and chess usage.")

if p_win_rate < 0.05:
    print("Significant correlation between workload and win rate.")
    if corr_win_rate > 0:
        print("Higher workload correlates with better win rates.")
    else:
        print("Higher workload correlates with worse win rates.")
else:
    print("No significant correlation between workload and win rate.")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

# Load the data
chess_data = pd.read_csv(chess_url)
academic_data = pd.read_csv(academic_url)

# Chess: derive the calendar day each game ended on and a 0/1 win flag.
chess_data['end_time'] = pd.to_datetime(chess_data['end_time'])
chess_data['date'] = chess_data['end_time'].dt.date
chess_data['win'] = chess_data['result_standardized'].apply(lambda x: 1 if x == 'win' else 0)

# Aggregate chess data by day: number of games and share of games won.
daily_chess = chess_data.groupby('date').agg(
    chess_count=('url', 'size'),
    win_rate=('win', 'mean')
).reset_index()

# Academic: dates are parsed with an explicit day-first format (DD/MM/YYYY).
academic_data['date'] = pd.to_datetime(academic_data['date'], format='%d/%m/%Y').dt.date

# Spread each event's workload over a look-back window: the event day itself
# plus the (WORKLOAD_WINDOW_DAYS - 1) days before it. Overlapping windows add up.
WORKLOAD_WINDOW_DAYS = 10
extended_workload = pd.DataFrame(
    [
        {'date': event_date - pd.Timedelta(days=offset), 'workload': 1}
        for event_date in academic_data['date']
        for offset in range(WORKLOAD_WINDOW_DAYS)
    ],
    # Explicit columns keep the groupby working even if there are no events.
    columns=['date', 'workload'],
)
daily_workload = extended_workload.groupby('date').sum().reset_index()

# Outer merge keeps days present in only one source. fillna(0) is right for the
# two count columns, but it also sets win_rate to 0 on days without any game.
merged_data = pd.merge(daily_chess, daily_workload, on='date', how='outer').fillna(0)

# Scatter plot: one point per day, windowed workload vs. games played
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=merged_data, x='workload', y='chess_count', alpha=0.7, ax=ax)
ax.set_xlabel('Academic Workload')
ax.set_ylabel('Chess Usage (Game Count)')
ax.set_title('Academic Workload vs Chess Usage')
ax.grid(True)
plt.show()

# Line plots: workload, game count and win rate on one shared date axis
fig, ax = plt.subplots(figsize=(12, 6))
dates = merged_data['date']
ax.plot(dates, merged_data['workload'], label='Workload', marker='o')
ax.plot(dates, merged_data['chess_count'], label='Chess Usage', marker='x')
ax.plot(dates, merged_data['win_rate'], label='Win Rate', marker='^')
ax.set_xlabel('Date')
ax.set_ylabel('Counts/Win Rate')
ax.set_title('Trends in Workload, Chess Usage, and Win Rate Over Time')
ax.legend()
ax.grid(True)
plt.show()

# Boxplot: distribution of the daily win rate at each workload level
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=merged_data, x='workload', y='win_rate', ax=ax)
ax.set_xlabel('Academic Workload')
ax.set_ylabel('Win Rate')
ax.set_title('Workload vs Win Rate')
ax.grid(True)
plt.show()

# Heatmap: pairwise correlations of the three daily metrics
correlation_matrix = merged_data[['workload', 'chess_count', 'win_rate']].corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix for Workload, Chess Usage, and Win Rate')
plt.show()

# Statistical Analysis
# Game counts are meaningful on every day (0 = no games), so use all rows.
corr_usage, p_usage = pearsonr(merged_data['workload'], merged_data['chess_count'])

# A win rate only exists on days with at least one game. The merge above fills
# no-game days with win_rate = 0, which would be read as "lost every game" and
# distort the correlation, so restrict this test to days that were played.
played_days = merged_data[merged_data['chess_count'] > 0]
if len(played_days) >= 2:
    corr_win_rate, p_win_rate = pearsonr(played_days['workload'], played_days['win_rate'])
else:
    # pearsonr needs at least two observations; report "undefined" instead of raising.
    corr_win_rate, p_win_rate = float('nan'), float('nan')

print(f"Correlation between workload and chess usage: {corr_usage:.2f} (p={p_usage:.2e})")
print(f"Correlation between workload and win rate: {corr_win_rate:.2f} (p={p_win_rate:.2e})")

# Interpretation of Results (5% significance level; a NaN p-value falls into
# the "no significant correlation" branch because NaN < 0.05 is False)
if p_usage < 0.05:
    print("Significant correlation between workload and chess usage.")
    if corr_usage > 0:
        print("Hypothesis supported: Chess usage increases with workload.")
    else:
        print("Hypothesis not supported: Chess usage decreases with workload.")
else:
    print("No significant correlation between workload and chess usage.")

if p_win_rate < 0.05:
    print("Significant correlation between workload and win rate.")
    if corr_win_rate > 0:
        print("Higher workload correlates with better win rates.")
    else:
        print("Higher workload correlates with worse win rates.")
else:
    print("No significant correlation between workload and win rate.")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

# Load the data
chess_data = pd.read_csv(chess_url)
academic_data = pd.read_csv(academic_url)

# Chess: derive the calendar day each game ended on and a 0/1 win flag.
chess_data['end_time'] = pd.to_datetime(chess_data['end_time'])
chess_data['date'] = chess_data['end_time'].dt.date
chess_data['win'] = chess_data['result_standardized'].apply(lambda x: 1 if x == 'win' else 0)

# Aggregate chess data by day: number of games and share of games won.
daily_chess = chess_data.groupby('date').agg(
    chess_count=('url', 'size'),
    win_rate=('win', 'mean')
).reset_index()

# Academic: dates are parsed with an explicit day-first format (DD/MM/YYYY).
academic_data['date'] = pd.to_datetime(academic_data['date'], format='%d/%m/%Y').dt.date

# Spread each event's workload over a look-back window: the event day itself
# plus the (WORKLOAD_WINDOW_DAYS - 1) days before it. Overlapping windows add up.
WORKLOAD_WINDOW_DAYS = 10
extended_workload = pd.DataFrame(
    [
        {'date': event_date - pd.Timedelta(days=offset), 'workload': 1}
        for event_date in academic_data['date']
        for offset in range(WORKLOAD_WINDOW_DAYS)
    ],
    # Explicit columns keep the groupby working even if there are no events.
    columns=['date', 'workload'],
)
daily_workload = extended_workload.groupby('date').sum().reset_index()

# Outer merge keeps days present in only one source. fillna(0) is right for the
# two count columns, but it also sets win_rate to 0 on days without any game.
merged_data = pd.merge(daily_chess, daily_workload, on='date', how='outer').fillna(0)

# 1. Workload vs Chess Usage: one point per day
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=merged_data, x='workload', y='chess_count', alpha=0.7, ax=ax)
ax.set_xlabel('Workload')
ax.set_ylabel('Chess Usage (Games Played)')
ax.set_title('Workload vs Chess Usage')
ax.grid(True)
plt.show()

# Pearson correlation for the same pair of columns
corr_usage, p_usage = pearsonr(
    merged_data['workload'],
    merged_data['chess_count'],
)
print(f"Correlation between workload and chess usage: {corr_usage:.2f} (p={p_usage:.2e})")

# 2. Workload vs Win Rate
# A win rate only exists on days with at least one game. The merge above fills
# no-game days with win_rate = 0, which would show up as days where every game
# was lost, so both the plot and the test use only days that were played.
played_days = merged_data[merged_data['chess_count'] > 0]

plt.figure(figsize=(8, 5))
sns.scatterplot(x='workload', y='win_rate', data=played_days, alpha=0.7)
plt.xlabel('Workload')
plt.ylabel('Win Rate')
plt.title('Workload vs Win Rate')
plt.grid(True)
plt.show()

# Correlation
if len(played_days) >= 2:
    corr_win_rate, p_win_rate = pearsonr(played_days['workload'], played_days['win_rate'])
else:
    # pearsonr needs at least two observations; report "undefined" instead of raising.
    corr_win_rate, p_win_rate = float('nan'), float('nan')
print(f"Correlation between workload and win rate: {corr_win_rate:.2f} (p={p_win_rate:.2e})")

# 3. Trend of Workload Over Time
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(merged_data['date'], merged_data['workload'], label='Workload', marker='o')
ax.set(xlabel='Date', ylabel='Workload', title='Workload Trend Over Time')
ax.grid(True)
plt.show()

# 4. Trend of Chess Usage Over Time
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    merged_data['date'],
    merged_data['chess_count'],
    label='Chess Usage',
    marker='x',
    color='orange',
)
ax.set(
    xlabel='Date',
    ylabel='Chess Usage (Games Played)',
    title='Chess Usage Trend Over Time',
)
ax.grid(True)
plt.show()

# 5. Trend of Win Rate Over Time
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    merged_data['date'],
    merged_data['win_rate'],
    label='Win Rate',
    marker='^',
    color='green',
)
ax.set(xlabel='Date', ylabel='Win Rate', title='Win Rate Trend Over Time')
ax.grid(True)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
chess_data = pd.read_csv(chess_url)
academic_data = pd.read_csv(academic_url)

# Chess: derive the calendar day each game ended on and a 0/1 win flag.
chess_data['end_time'] = pd.to_datetime(chess_data['end_time'])
chess_data['date'] = chess_data['end_time'].dt.date
chess_data['win'] = chess_data['result_standardized'].apply(lambda x: 1 if x == 'win' else 0)

# Aggregate chess data by day: number of games and share of games won.
daily_chess = chess_data.groupby('date').agg(
    chess_count=('url', 'size'),
    win_rate=('win', 'mean')
).reset_index()

# Academic: dates are parsed with an explicit day-first format (DD/MM/YYYY).
academic_data['date'] = pd.to_datetime(academic_data['date'], format='%d/%m/%Y').dt.date

# Spread each event's workload over a look-back window: the event day itself
# plus the (WORKLOAD_WINDOW_DAYS - 1) days before it. Overlapping windows add up.
WORKLOAD_WINDOW_DAYS = 10
extended_workload = pd.DataFrame(
    [
        {'date': event_date - pd.Timedelta(days=offset), 'workload': 1}
        for event_date in academic_data['date']
        for offset in range(WORKLOAD_WINDOW_DAYS)
    ],
    # Explicit columns keep the groupby working even if there are no events.
    columns=['date', 'workload'],
)
daily_workload = extended_workload.groupby('date').sum().reset_index()

# Merge datasets. This step was missing, so the cell depended on a merged_data
# left behind by an earlier cell and ignored the frames computed just above.
merged_data = pd.merge(daily_chess, daily_workload, on='date', how='outer').fillna(0)

# The 'date' column holds plain date objects; convert it to pandas datetimes so
# the .dt accessor is available for the month filter below.
merged_data['date'] = pd.to_datetime(merged_data['date'])

# Keep only Fall (September-December) and Spring (January-May) term days.
month = merged_data['date'].dt.month
in_fall = (month >= 9) & (month <= 12)
in_spring = (month >= 1) & (month <= 5)
fall_and_spring = merged_data[in_fall | in_spring]

# Visualization 1: distribution of daily games played at each workload level,
# restricted to the Fall and Spring term days
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=fall_and_spring, x='workload', y='chess_count', ax=ax)
ax.set_xlabel('Workload')
ax.set_ylabel('Chess Usage (Games Played)')
ax.set_title('Box Plot: Workload vs Chess Usage (Fall & Spring Terms)')
ax.grid(axis='y')
plt.show()

# Visualization 2: mean daily games played per workload level (same term days)
usage_by_workload = fall_and_spring.groupby('workload')['chess_count']
average_usage = usage_by_workload.mean().reset_index()
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=average_usage, x='workload', y='chess_count', palette='Blues_d', ax=ax)
ax.set_xlabel('Workload')
ax.set_ylabel('Average Chess Usage (Games Played)')
ax.set_title('Bar Plot: Average Chess Usage by Workload (Fall & Spring Terms)')
ax.grid(axis='y')
plt.show()

