# Transcript EDA
- base cleaning on the transcripts has been done
- next steps are to see what other cleaning needs to be done, and explore what we have

In [3]:
# Importing modules
import pandas as pd
import numpy as np

In [64]:
acoc_df = pd.read_csv('../transcripts/csv/acoc_df.csv')
aso_df = pd.read_csv('../transcripts/aso_df.csv')
fhfy_df = pd.read_csv('../transcripts/fhfy_df.csv')
fhsy_df = pd.read_csv('../transcripts/fhsy_df.csv')
tuc_df = pd.read_csv('../transcripts/tuc_df.csv')
tuc2_df = pd.read_csv('../transcripts/tuc2_df.csv')

In [65]:
# renaming columns
acoc_df.rename(columns={'0':'Line'}, inplace=True)
aso_df.rename(columns={'0':'Line'}, inplace=True)
fhfy_df.rename(columns={'0':'Line'}, inplace=True)
fhsy_df.rename(columns={'0':'Line'}, inplace=True)
tuc_df.rename(columns={'0':'Line'}, inplace=True)
tuc2_df.rename(columns={'0':'Line'}, inplace=True)

In [34]:
# starting with acoc

In [66]:
# splitting columns into player & line
acoc_df_split = acoc_df['Line'].str.split(':', n=1, expand=True)

In [68]:
# dropping nans
acoc_df_split.dropna(inplace=True)

In [79]:
# renaming columns
acoc_df_split.rename(columns={0:'Player',1:'Line'}, inplace=True)

In [89]:
# looking only for natural 20s
acoc_20s = acoc_df_split.loc[acoc_df_split['Line'].str.contains('nat 20|natural 20', na=False, case=False)]

In [103]:
# removing instances of Brennan (DM) or cocked rolls
acoc_20s = acoc_20s[acoc_20s['Player'].str.contains('Brennan')==False]
acoc_20s = acoc_20s[acoc_20s['Line'].str.contains('cocked|Cocked')==False]

In [109]:
# resseting index
acoc_20s.reset_index(drop=True, inplace=True)

In [111]:
acoc_20s.head()

Unnamed: 0,Player,Line
0,Siobhan,Nat 20.
1,Murph,Nat 20!
2,Ally,Nat 20!
3,Emily,Two nat 20s on the first attack!
4,Emily,"Oh, I've never wanted a nat 20 more!"


- it looks like some of these lines are about the act of getting nat 20s
- not necessarily saying they got them
- this may take some manual review

In [131]:
# creating a cleaning function for finding nat 20s
def find_20s(df):
    # splitting columns
    df = df['Line'].str.split(':', n=1, expand=True)
    # dropping nans
    df.dropna(inplace=True)
    # renaming columns
    df.rename(columns={0:'Player',1:'Line'}, inplace=True)
    # looking only for natural 20s
    df = df.loc[df['Line'].str.contains('nat 20|natural 20', na=False, case=False)]
    # removing instances of Brennan (DM) or cocked rolls
    df = df[df['Player'].str.contains('Brennan')==False]
    df = df[df['Line'].str.contains('cocked|Cocked')==False]
    df.reset_index(drop=True, inplace=True)
    return df

In [126]:
# testing
acoc_testing = acoc_df.copy(deep=True)

In [127]:
acoc_testing.head()

Unnamed: 0,Line
0,Published using Google Docs
1,[Learn More](https://support.google.com/docs/a...
2,Episode 1: There is Strength in Sweetness
3,Updated automatically every 5 minutes
4,Dimension 20 Season 5


In [128]:
acoc_testing = find_20s(acoc_testing)

In [130]:
acoc_testing

Unnamed: 0,Player,Line
0,Siobhan,Nat 20.
1,Murph,Nat 20!
2,Ally,Nat 20!
3,Emily,Two nat 20s on the first attack!
4,Emily,"Oh, I've never wanted a nat 20 more!"
...,...,...
111,Ally,"All right, second attack. Nat 20!"
112,Zac,Nat 20. [putting his hand over his mouth]
113,Lou,That's a nat 20.
114,Lou,"Also, just a fun fact, for those at home, thi..."


In [None]:
aso_df
fhfy_df
fhsy_df
tuc_df
tuc2_df

In [132]:
# time to use this function on the other seasons
aso_20s = find_20s(aso_df)
fhfy_20s = find_20s(fhfy_df)
fhsy_20s = find_20s(fhsy_df)
tuc_20s = find_20s(tuc_df)
tuc2_20s = find_20s(tuc2_df)

In [135]:
# exporting for manual review
acoc_20s.to_csv('eda_csvs/acoc_20s.csv', index=False)
aso_20s.to_csv('eda_csvs/aso_20s.csv', index=False)
fhfy_20s.to_csv('eda_csvs/fhfy_20s.csv', index=False)
fhsy_20s.to_csv('eda_csvs/fhsy_20s.csv', index=False)
tuc_20s.to_csv('eda_csvs/tuc_20s.csv', index=False)
tuc2_20s.to_csv('eda_csvs/tuc2_20s.csv', index=False)

In [139]:
# repeating above for critical failures
def find_1s(df):
    # splitting columns
    df = df['Line'].str.split(':', n=1, expand=True)
    # dropping nans
    df.dropna(inplace=True)
    # renaming columns
    df.rename(columns={0:'Player',1:'Line'}, inplace=True)
    # looking only for natural 1s
    df = df.loc[df['Line'].str.contains('nat 1|natural 1', na=False, case=False)]
    # removing instances of Brennan (DM) or cocked rolls
    df = df[df['Player'].str.contains('Brennan')==False]
    df = df[df['Line'].str.contains('cocked|Cocked')==False]
    df.reset_index(drop=True, inplace=True)
    return df

In [142]:
acoc_1s = find_1s(acoc_df)
aso_1s = find_1s(aso_df)
fhfy_1s = find_1s(fhfy_df)
fhsy_1s = find_1s(fhsy_df)
tuc_1s = find_1s(tuc_df)
tuc2_1s = find_1s(tuc2_df)

In [None]:
# exporting for manual review
acoc_1s.to_csv('eda_csvs/acoc_1s.csv', index=False)
aso_1s.to_csv('eda_csvs/aso_1s.csv', index=False)
fhfy_1s.to_csv('eda_csvs/fhfy_1s.csv', index=False)
fhsy_1s.to_csv('eda_csvs/fhsy_1s.csv', index=False)
tuc_1s.to_csv('eda_csvs/tuc_1s.csv', index=False)
tuc2_1s.to_csv('eda_csvs/tuc2_1s.csv', index=False)

### Taking a look at our critical successes

In [9]:
acoc_20s = pd.read_csv('eda_csvs/acoc_20s.csv')
aso_20s = pd.read_csv('eda_csvs/aso_20s.csv')
fhfy_20s = pd.read_csv('eda_csvs/fhfy_20s.csv')
fhsy_20s = pd.read_csv('eda_csvs/fhsy_20s.csv')
tuc_20s = pd.read_csv('eda_csvs/tuc_20s.csv')
tuc2_20s = pd.read_csv('eda_csvs/tuc2_20s.csv')

In [49]:
acoc_20s_df = acoc_20s.groupby(['Player']).count()
acoc_20s_df = acoc_20s_df.sort_values('Line', ascending=False)
acoc_20s_df

Unnamed: 0_level_0,Line
Player,Unnamed: 1_level_1
Ally,20
Siobhan,18
Murph,12
Emily,9
Lou,9
Zac,9


In [44]:
aso_20s_df = aso_20s.groupby(['Player']).count()
aso_20s_df = aso_20s_df.sort_values('Line', ascending=False)
aso_20s_df

Unnamed: 0_level_0,Line
Player,Unnamed: 1_level_1
Emily,18
Zac,15
Lou,13
Murph,13
Ally,9
Siobhan,3


In [45]:
fhfy_20s_df = fhfy_20s.groupby(['Player']).count()
fhfy_20s_df = fhfy_20s_df.sort_values('Line', ascending=False)
fhfy_20s_df

Unnamed: 0_level_0,Line
Player,Unnamed: 1_level_1
Emily,9
Murph,9
Zac,9
Siobhan,8
Ally,7
Lou,5


In [46]:
fhsy_20s_df = fhsy_20s.groupby(['Player']).count()
fhsy_20s_df = fhsy_20s_df.sort_values('Line', ascending=False)
fhsy_20s_df

Unnamed: 0_level_0,Line
Player,Unnamed: 1_level_1
Ally,18
Murph,14
Siobhan,13
Emily,11
Lou,9
Zac,6


In [47]:
tuc_20s_df = tuc_20s.groupby(['Player']).count()
tuc_20s_df = tuc_20s_df.sort_values('Line', ascending=False)
tuc_20s_df

Unnamed: 0_level_0,Line
Player,Unnamed: 1_level_1
Lou,11
Ally,9
Emily,9
Siobhan,6
Zac,5
Murph,4
Zac,1


In [48]:
tuc2_20s_df = tuc2_20s.groupby(['Player']).count()
tuc2_20s_df = tuc2_20s_df.sort_values('Line', ascending=False)
tuc2_20s_df

Unnamed: 0_level_0,Line
Player,Unnamed: 1_level_1
Zac,15
Ally,8
Siobhan,7
Emily,6
Lou,4
Murph,2


### Taking a look at our critical failures

In [43]:
acoc_1s = pd.read_csv('eda_csvs/acoc_1s.csv')
aso_1s = pd.read_csv('eda_csvs/aso_1s.csv')
fhfy_1s = pd.read_csv('eda_csvs/fhfy_1s.csv')
fhsy_1s = pd.read_csv('eda_csvs/fhsy_1s.csv')
tuc_1s = pd.read_csv('eda_csvs/tuc_1s.csv')
tuc2_1s = pd.read_csv('eda_csvs/tuc2_1s.csv')

In [50]:
acoc_1s_df = acoc_1s.groupby(['Player']).count()
acoc_1s_df = acoc_1s_df.sort_values('Line', ascending=False)
acoc_1s_df

Unnamed: 0_level_0,Line
Player,Unnamed: 1_level_1
Siobhan,11
Ally,10
Murph,7
Emily,5
Lou,5
Zac,1


In [51]:
aso_1s_df = aso_1s.groupby(['Player']).count()
aso_1s_df = aso_1s_df.sort_values('Line', ascending=False)
aso_1s_df

Unnamed: 0_level_0,Line
Player,Unnamed: 1_level_1


In [52]:
fhfy_1s_df = fhfy_1s.groupby(['Player']).count()
fhfy_1s_df = fhfy_1s_df.sort_values('Line', ascending=False)
fhfy_1s_df

Unnamed: 0_level_0,Line
Player,Unnamed: 1_level_1
Ally,10
Murph,8
Zac,8
Siobhan,6
Lou,3
Emily,2


In [53]:
fhsy_1s_df = fhsy_1s.groupby(['Player']).count()
fhsy_1s_df = fhsy_1s_df.sort_values('Line', ascending=False)
fhsy_1s_df

Unnamed: 0_level_0,Line
Player,Unnamed: 1_level_1
Zac,11
Lou,9
Emily,7
Siobhan,5
Murph,3
Ally,2


In [54]:
tuc_1s_df = tuc_1s.groupby(['Player']).count()
tuc_1s_df = tuc_1s_df.sort_values('Line', ascending=False)
tuc_1s_df

Unnamed: 0_level_0,Line
Player,Unnamed: 1_level_1
Ally,6
Emily,6
Siobhan,5
Zac,4
Lou,2
Murph,2


In [55]:
tuc2_1s_df = tuc2_1s.groupby(['Player']).count()
tuc2_1s_df = tuc2_1s_df.sort_values('Line', ascending=False)
tuc2_1s_df

Unnamed: 0_level_0,Line
Player,Unnamed: 1_level_1
Siobhan,9
Murph,4
Zac,4
Ally,3
Emily,3
Lou,3


In [56]:
# exporting csvs

In [57]:
acoc_20s_df.to_csv('grouped_csvs/acoc_20s_grouped.csv', index=False)
aso_20s_df.to_csv('grouped_csvs/aso_20s_grouped.csv', index=False)
fhfy_20s_df.to_csv('grouped_csvs/fhfy_20s_grouped.csv', index=False)
fhsy_20s_df.to_csv('grouped_csvs/fhsy_20s_grouped.csv', index=False)
tuc_20s_df.to_csv('grouped_csvs/tuc_20s_grouped.csv', index=False)
tuc2_20s_df.to_csv('grouped_csvs/tuc2_20s_grouped.csv', index=False)
acoc_1s_df.to_csv('grouped_csvs/acoc_1s_grouped.csv', index=False)
aso_1s_df.to_csv('grouped_csvs/aso_1s_grouped.csv', index=False)
fhfy_1s_df.to_csv('grouped_csvs/fhfy_1s_grouped.csv', index=False)
fhsy_1s_df.to_csv('grouped_csvs/fhsy_1s_grouped.csv', index=False)
tuc_1s_df.to_csv('grouped_csvs/tuc_1s_grouped.csv', index=False)
tuc2_1s_df.to_csv('grouped_csvs/tuc2_1s_grouped.csv', index=False)