In [None]:
import io, os, sys, types

In [None]:
# Run pip through the interpreter that backs this kernel (sys.executable)
# rather than whichever `pip` happens to be first on PATH.
!{sys.executable} -m pip install regex
!{sys.executable} -m pip install squarify
!{sys.executable} -m pip install wordcloud

In [None]:
import matplotlib
import numpy as np
import pandas as pd
import plotly.express as px
from collections import Counter
import matplotlib.pyplot as plt

In [None]:
# Load the episode script; ":" is used as the field separator.
# `error_bad_lines` / `warn_bad_lines` were deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='skip'` (pandas >= 1.3) is the replacement: rows with too many
# fields are dropped without emitting a warning.
df = pd.read_csv('Data/season1/e1.txt', sep=":", on_bad_lines='skip')

# 1. Data Exploration

In [None]:
# Preview the first rows to check how the file was parsed.
df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Distinct values of the "person" column (column must already carry that name).
df['person'].unique()

# 2. Dataframe formatting

### Drop empty values

In [None]:
# Keep only rows where every field is present, then give the two
# remaining columns the names used throughout the rest of the notebook.
df = df.dropna(axis=0, how='any')
df.columns = ['person', 'line']

### Make all characters lowercase

In [None]:
# Lower-case both text columns so later comparisons ignore capitalisation.
for column in ('person', 'line'):
    df[column] = df[column].str.lower()

## Clean the column with the name of the character 
We need to clean the names in the first column of the dataframe 
* JON/ROBB           ==> JON
* JON (to BRAN)      ==> JON

For this we use **Regex**.
Regex stands for **regular expression**: a sequence of characters that defines a search pattern. Such patterns are typically used by string-searching algorithms for "find" or "find and replace" operations on strings, or for input validation.

You can find basic examples of Regex here: https://cs.lmu.edu/~ray/notes/regex/

In [None]:
import regex as re

In [None]:
# Normalise the speaker column. `regex=` is passed explicitly on every call:
# the default changed from True to False in pandas 2.0, which would turn the
# two pattern-based substitutions below into silent no-ops.

# "jon/robb" -> "jon robb" (plain character swap, no pattern needed)
df['person'] = df['person'].str.replace('/', ' ', regex=False)
# "jon (to bran)" -> "jon " : remove a parenthesised stage direction
df['person'] = df['person'].str.replace(r"\(.*\)", "", regex=True)
# "jon snow" -> "jon " : keep the first word (and its trailing whitespace), drop the second
df['person'] = df['person'].str.replace(r'^(\s*(?:\S+\s+){1})\S+', r'\1', regex=True)

### Get rid of spaces that might be left before or after a name

In [None]:
# Trim leading/trailing whitespace left over from the name clean-up.
trimmed_names = df['person'].str.strip()
df['person'] = trimmed_names

## Clean the 'line' column

In [None]:
# Remove commas and full stops from the spoken text.
# `regex=False` is essential for '.': interpreted as a regular expression it
# matches *any* character, which would erase the whole line. Being explicit
# also avoids depending on the pandas-version-specific default.
df['line'] = (
    df['line']
    .str.replace(',', '', regex=False)
    .str.replace('.', '', regex=False)
)

### Now we print the resulting dataframe to make sure that it is consistent and that we can start the analysis

In [None]:
# Preview the cleaned frame before starting the analysis.
df.head()

# Count !? repetitions per character

1. Use a **lambda** function to count the `!` occurrences in each line and save them in a new column
2. Use a **lambda** function to count the `?` occurrences in each line and save them in a new column
3. Use the **groupby** function to group and sum the `?` and `!` counts **by character**
4. Plot the bar chart

In [None]:
# Number of exclamation marks in each spoken line.
df['exc'] = df['line'].apply(lambda text: text.count("!"))

In [None]:
# Number of question marks in each spoken line.
df['ques'] = df['line'].apply(lambda text: text.count("?"))

In [None]:
# Total question marks per speaker, as a two-column frame (person, ques).
ques_totals = df.groupby('person')['ques'].sum()
df1 = ques_totals.reset_index()

In [None]:
# Bar chart of question marks per speaker, bars ordered from highest to lowest total.
fig = px.bar(df1, x="person", y="ques")
fig.update_layout(xaxis=dict(categoryorder='total descending'))

In [None]:
# "voice" = question marks + exclamation marks in a line.
df['voice'] = df['ques'].add(df['exc'])

# Sum per speaker and keep `person` as a regular column for plotting.
df2 = df.groupby('person', as_index=False)['voice'].sum()

# Bars ordered from highest to lowest total.
fig = px.bar(df2, x="person", y="voice")
fig.update_layout(xaxis=dict(categoryorder='total descending'))

## Count the most repeated words of a specific character

1. Filter the dataframe and keep only the lines spoken by the chosen character
2. Create a string with all the words
3. Use the `Counter()` function to count the repetitions and save them in a dataframe
4. Convert the dataframe into a dictionary with the words and their frequency

In [None]:
# Rows whose speaker is exactly "jon".
is_jon = df['person'] == 'jon'
df3 = df[is_jon]

In [None]:
# One long string holding every selected line, lower-cased,
# each line followed by a single space.
words = ''.join('{} '.format(spoken.lower()) for spoken in df3.line.values)

In [None]:
# Rank whitespace-separated tokens by frequency and keep the 200 most common...
top_tokens = Counter(words.split()).most_common(200)
ranked = pd.DataFrame(top_tokens, columns=['word', 'frequency'])
# ...then discard the first 50 rows of that ranking (the most frequent tokens).
wd = ranked.iloc[50:]

In [None]:
# word -> frequency mapping.
word_list = wd['word'].tolist()
freq_list = wd['frequency'].tolist()
data = {word: freq for word, freq in zip(word_list, freq_list)}

## Plot word cloud
A word cloud draws each word at a size that reflects how often it occurs.

**TODO:** add WordCloud documentation.

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
# print(STOPWORDS) # Words we are blocking out

In [None]:
# WordCloud ignores its `stopwords` argument when the cloud is built with
# generate_from_frequencies(), so the stop words have to be removed from the
# frequency table before rendering.
frequencies = {word: count for word, count in data.items() if word not in STOPWORDS}
if not frequencies:
    # Everything was a stop word; fall back to the unfiltered table so the
    # cell still renders instead of raising on an empty input.
    frequencies = data

wc = WordCloud(background_color='white',
               width=800,
               height=400,
               max_words=100).generate_from_frequencies(frequencies)

fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wc, interpolation='bilinear')
ax.axis('off')  # the image needs no ticks or frame
plt.show()

# Tree Map

A tree map shows each category as a rectangle whose area is proportional to its value.

**TODO:** add tree map documentation.

Steps to format the data and plot the tree map:

1. Count the number of words spoken for each line of the script
2. Group and sum the words spoken by each character 
3. Transform all the data to List() type
4. Plot

In [None]:
# Number of whitespace-separated words in each spoken line.
df['words'] = df['line'].map(lambda text: len(text.split()))

In [None]:
# Total words spoken per speaker, with `person` kept as a regular column.
df2 = df.groupby('person', as_index=False)['words'].sum()

In [None]:
# Plain Python lists for the tree map: x1 = speaker names, x2 = word totals.
x1 = df2['person'].tolist()
x2 = df2['words'].tolist()

In [None]:
import squarify

# One rectangle per speaker (labels from x1, sizes from x2), with a thin dark outline.
tile_border = dict(linewidth=1, edgecolor="#222222")
squarify.plot(sizes=x2, label=x1, alpha=.7, bar_kwargs=tile_border)