# Cris Working Notebook
___

# 1.10 Friday - Data Mapping

In [2]:
import pandas as pd

In [None]:
# Read the three CSV inputs used in this section, in the same order as before.
train, labels, specs = (
    pd.read_csv(csv_path)
    for csv_path in ("train.csv", "train_labels.csv", "specs.csv")
)

In [None]:
# Row counts first, a blank-ish separator line, then the number of distinct
# values in two columns. One print call with a newline separator emits the
# same six lines as six separate prints.
print(
    f"dataset: {len(train)}",
    f"event_id: {len(train['event_id'])}",
    f"game_session: {len(train['game_session'])}",
    " ",
    f"unique event_codes: {train['event_code'].nunique(dropna=False)}",
    f"unique title: {train['title'].nunique(dropna=False)}",
    sep="\n",
)

In [None]:
train.head()

In [None]:
train[train.world == "MAGMAPEAK"].groupby(["type","title"])["world"].count()

In [None]:
magma = train[train.world == "MAGMAPEAK"]

In [None]:
# Frequency of each value in the `type` column, followed by the length of the
# `event_id` column, each on its own line.
print(train["type"].value_counts(), len(train["event_id"]), sep="\n")

In [None]:
len(train.event_id.unique())

In [None]:
len(train.title.unique())

In [None]:
len(train.event_code.unique())

In [None]:
train.event_code.value_counts()

In [None]:
bdf = pd.merge(train,specs,on="event_id")

In [None]:
# Work on a reproducible sample of up to 100k rows from the merged frame.
# The fixed seed makes the event_code lookups that follow return the same rows
# on every run; min() avoids the ValueError that sample() raises when asked
# for more rows than the frame holds.
df = bdf[["event_code", "event_id", "info"]].sample(
    n=min(100_000, len(bdf)), random_state=42
)

In [None]:
df[df.event_code==4035]

In [None]:
labels.head()

---

# 1.13 Monday - Hypothesis Testing

## Summary of Insights:
1. There is a very weak negative correlation between the number of assessments taken by a user and their average accuracy.
 - This tells me that accuracy does not depend on the number of times assessments are taken.
 - There are distinct low, average, and high performers - why?

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the frame this section analyses. The cells that follow read `big`
# straight away, so leaving this line commented out breaks Restart & Run All.
big = pd.read_csv("big.csv")

In [None]:
big.head()

## Are the Number of Assessments Taken and Average Accuracy Correlated with Each Other?

### Look at data

In [None]:
print("Accuracy Measures:")
sorted(big.accuracy.unique())

In [None]:
plt.hist(x="accuracy",data=big)
plt.show()

In [None]:
print(f"no. of unique installation ids: {len(big.installation_id.unique())}")

### Trim data for analysis

In [None]:
assessment = big.groupby("installation_id")["accuracy"].agg(["count","mean"])

In [None]:
assessment.head()

In [None]:
assessment.columns

In [None]:
assessment.columns = ["no_assessment","ave_accuracy"]
assessment.head()

In [None]:
assessment.describe()

In [None]:
# Scatter of assessments taken vs. average accuracy, drawn through the
# explicit Figure/Axes interface. Only rows with no_assessment <= 280 are
# plotted (cut-off carried over as-is; its rationale is not recorded here).
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(
    x="no_assessment",
    y="ave_accuracy",
    data=assessment.loc[assessment["no_assessment"] <= 280],
)

ax.set_title("Number of Assessments Taken and the corresponding Average Accuracy")
ax.set_xlabel("Assesment Taken")
ax.set_ylabel("Ave Accuracy")

It seems there are distinct low, average, and high performers, and that the number of assessments they take does not matter. From the visualization, there does not appear to be a correlation between the total number of assessments taken and the average accuracy. Nevertheless, let's run a statistical test to check this.

### Plan:
1. Take 5 random samples and get the Pearson's r for the two variables, `no_assessment` and `ave_accuracy`.
    - The samples cover 20%, 30%, 40%, 50% and 60% of the population (3614); the 30% sample, for example, has 1084 observations.
2. Get the mean of the Pearson's r of these 5 random samples to see the strength of correlation.

In [None]:
# Draw five samples of increasing size (20% to 60% of the rows).
# Each draw gets its own fixed seed: the correlations computed from these
# samples are then identical on every Restart & Run All, while using a
# different seed per draw keeps the samples from being simple prefixes of
# one another.
s1 = assessment.sample(frac=.20, random_state=1)
s2 = assessment.sample(frac=.30, random_state=2)
s3 = assessment.sample(frac=.40, random_state=3)
s4 = assessment.sample(frac=.50, random_state=4)
s5 = assessment.sample(frac=.60, random_state=5)

Check that these samples are random.

In [None]:
s1.head()

In [None]:
s2.head()

In [None]:
print(f"shape of sample 3: {s3.shape}")
print(f"shape of sample 4: {s4.shape}")
print(f"shape of sample 5: {s5.shape}")

In [None]:
# Pearson correlation between assessment count and average accuracy, computed
# once per sample; each result unpacks into its (r, p-value) pair.
(r1, pval1), (r2, pval2), (r3, pval3), (r4, pval4), (r5, pval5) = [
    stats.pearsonr(drawn["no_assessment"], drawn["ave_accuracy"])
    for drawn in (s1, s2, s3, s4, s5)
]

In [None]:
# Collect the five samples and record how many rows each one holds.
samples = [s1, s2, s3, s4, s5]
sample_size = list(map(len, samples))

In [None]:
pearsons_r = [r1,r2,r3,r4,r5]
pvalue = [pval1,pval2, pval3, pval4, pval5]

In [None]:
# One row per sample: its label, Pearson's r, the p-value and the sample size.
metric_table = pd.DataFrame(
    data={
        "sample": ["s1", "s2", "s3", "s4", "s5"],
        "pearsons_r": pearsons_r,
        "pvalue": pvalue,
        "sample_size": sample_size,
    }
)

In [None]:
metric_table.head()

In [None]:
print(f"Average Pearsons R: {metric_table.pearsons_r.mean()}")
print("Very Weak Negative Relationship")

## What is the relationship between the type of activity users engage in and their accuracy score? Do users who did all three activity types (game, activity, clip) in combination have higher accuracy scores?

In [3]:
import pandas as pd
import wrangle

In [None]:
test_takers = wrangle.get_assessment_users(cache=False)

In [None]:
# NOTE(review): `bigger` is not assigned in any cell visible here (the cell
# before this one binds `test_takers`) — presumably left over from an earlier
# kernel session; confirm which frame this should be before relying on it.
bigger.head()

In [None]:
activity = bigger[["installation_id","type","accuracy"]]
activity.head()

In [None]:
# Add one 0/1 indicator column per activity type (Clip, Activity, Game).
# `activity` was sliced out of another frame, so writing columns into it
# in place is chained assignment; assign() builds a new frame instead, which
# avoids that problem and gives the same result when the cell is re-run.
activity = activity.assign(
    c=np.where(activity["type"] == "Clip", 1, 0),
    a=np.where(activity["type"] == "Activity", 1, 0),
    g=np.where(activity["type"] == "Game", 1, 0),
)

In [None]:
def cg_agg(df, c, a):
    """Unfinished aggregation helper, kept as an explicit stub.

    The original definition stopped at a bare ``if`` and could not be parsed,
    so the intended logic is unknown. The signature is preserved so the cell
    runs under Restart & Run All.

    Parameters
    ----------
    df, c, a
        Left undocumented: the original body never used them, so their
        meaning cannot be determined from this notebook.

    Raises
    ------
    NotImplementedError
        Always, until the helper is actually written.
    """
    # TODO: implement once the intended aggregation is decided.
    raise NotImplementedError("cg_agg was left unfinished in the original notebook")

In [None]:
# Sum every column per installation_id. The string alias "sum" is used
# instead of the builtin `sum`: pandas has deprecated passing the builtin to
# agg(), and the alias keeps the current behaviour (the pandas groupby sum).
act_agg = activity.groupby("installation_id").agg("sum")

In [None]:
activity.sample(20)

In [None]:
# NOTE(review): no visible cell creates a `cag` column on `activity` (only
# `c`, `a` and `g` are added earlier) — as written this lookup presumably
# fails; confirm how `cag` was meant to be derived.
activity[activity.cag==1]

In [None]:
# Call the wrangle helper again, this time with cache=True. The original line
# ended in a stray colon (a syntax error) and lacked the `wrangle.` prefix
# used by the earlier call, so it could not execute.
wrangle.get_assessment_users(cache=True)