<a href="https://colab.research.google.com/github/bchandayo/gtk/blob/master/Friends_EDA_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming a download to the temp file.
CHUNK_SIZE = 40960
# Comma-separated list of '<directory>:<percent-encoded download URL>' entries.
# Each entry is split on ':' and the URL part is decoded with unquote() further down.
# NOTE(review): the URL carries a signed query string (X-Goog-Date=20240211T024816Z,
# X-Goog-Expires=259200) - presumably a time-limited link that has to be regenerated
# before this cell can download anything again; confirm.
DATA_SOURCE_MAPPING = 'friends-tv-series-screenplay-script:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F873061%2F1487503%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240211%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240211T024816Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1456cde0246c5d7e4eff6cc24eb6839a0bf1aa9fd2b95ffe292e350ad0045a3b60f514e8f899631fd3a51944c08c09e98e370d96b14c048fbb5e05b7901f53d4a1575150638f9bf9bb242c7c60baecee97602bf83af16ed64689f14cb2436312a8d86b22e7cff6886013d03008fc75a3c80c7937d657144f100eafba8f334da63bee7600f8ffb0f39def0723fad810b78588de36b1724ed19f313063c59390056fac25dc9c01883c5f673f0e9bc7626008b1a257819c894b285ebf2b20c992859eb2e73eeee1071170177f407b293a74c2ff76141d49a763cbd468390944437b59aaa9c9b95f77ac41bd54279ee4cbab41fb3cdfc29d5d5c7d1aa9f791140967'

# Directory the downloaded archives are extracted under (one sub-directory per entry).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created next to the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it below KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is '<directory>:<percent-encoded URL>'. Split on the first colon only,
    # so an entry still parses if its URL part ever contains a literal ':'.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header may be absent; in that case the byte counter
            # is still printed but the progress bar stays empty instead of crashing.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                # 50-character progress bar; clamped so a wrong header cannot overflow it.
                done = min(50, int(50 * dl / total_length)) if total_length > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name and
            # would otherwise see a truncated archive.
            tfile.flush()
            # NOTE(review): extractall() trusts the member paths stored in the archive;
            # only use this with archives from a trusted source.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing 'tarfile' here would
                # overwrite the imported module.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


# Introductory EDA on Friends

## 1. Import Libraries

In [None]:
import re
import os
import nltk
import spacy
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from PIL import Image
from wordcloud import WordCloud

from nltk import word_tokenize
from nltk.util import ngrams

In this notebook we will produce a basic analysis of the Friends transcripts and hopefully get a few insights. So, pour yourself a damn fine cup of coffee and bear with me!

## 2. Data loading and preprocessing

In [None]:
# Count the entries in the dataset folder; every entry is reported as one episode.
dataset_entries = os.listdir('../input/friends-tv-series-screenplay-script')
ep_num = len(dataset_entries)


print(f"Friends Season consists of {ep_num} episodes.")

In [None]:
import glob

folder_name = "../input/friends-tv-series-screenplay-script/"

# Read every .txt transcript and concatenate the contents into one string.
# - sorted(): glob returns paths in arbitrary order; sorting makes the
#   concatenation (and every count derived from it) reproducible between runs.
# - with-block: the file handle is closed even if read() raises.
# - join(): avoids re-copying the growing string on every iteration.
transcript_paths = sorted(glob.glob(os.path.join(folder_name, '*.txt')))
transcript_chunks = []
for transcript_path in transcript_paths:
    with open(transcript_path, 'r') as transcript_file:
        transcript_chunks.append(transcript_file.read())
texts = "".join(transcript_chunks)

In [None]:
# Number of characters in the combined `texts` string (all files read above).
len(texts)

In [None]:
# Replace every run of characters other than ASCII letters (digits, punctuation,
# whitespace, newlines, ...) with a single space.
non_letter_run = re.compile('[^A-Za-z]+')
text = non_letter_run.sub(' ', texts)

In [None]:
# Load an English spaCy pipeline to get at its default stop-word set.
# The bare "en" shortcut link was removed in spaCy 3, so the full package name is
# tried first; the shortcut is kept as a fallback for old installations.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    nlp = spacy.load("en")

# Extend the stop words with contraction fragments, filler words and screenplay notes.
# NOTE(review): the multi-word entries ("opening credits scene", ...) can only match a
# token that equals the whole phrase; the tokens filtered later are presumably single
# words, in which case these entries have no effect - confirm.
nlp.Defaults.stop_words |= {
    "d", "ll", "m", "re", "s", "ve", "t", "oh", "uh", "na", "okay",
    "didn", "don", "gon", "j", "hm", "um", "dr", "room", "int", "ext",
    "cut", "day", "night", "theme", "tune", "music", "ends", "view", "opening credits scene",
    "commercial break scene", "hey hey hey", "hey", "closing credits scene", "scene",
    "closeup", 'freshly', 'squeezed', 'fade',
}
# `stopwords` is the same set object as nlp.Defaults.stop_words, so later in-place
# additions to that set are visible through this name as well.
stopwords = nlp.Defaults.stop_words

## 3. Most frequent words

In [None]:
# function to find and plot frequent words
def plot_words(words,title,color="#114d1e"):
    """Draw a horizontal bar chart of word counts, first entry at the top.

    Args:
        words: sequence of entries where entry[0] is the word (bar label) and
            entry[1] is its count (bar length). If a word occurs more than once,
            the last count wins.
        title: text shown above the chart.
        color: bar colour passed on to matplotlib.
    """
    frequency_by_word = {entry[0]: entry[1] for entry in words}
    bar_positions = range(len(frequency_by_word))
    bar_labels = list(frequency_by_word.keys())
    bar_lengths = list(frequency_by_word.values())

    plt.figure(figsize=(8,6))
    plt.title(title, fontsize=14)
    plt.barh(bar_positions, bar_lengths, color=color, align="center")
    plt.yticks(bar_positions, bar_labels, fontsize=12)
    # barh puts position 0 at the bottom; flip the axis so it ends up on top.
    plt.gca().invert_yaxis()
    plt.show()

def plot_ngrams(ngrams,title,color="#7a2822"):
    """Draw a horizontal bar chart of n-gram counts, first entry at the top.

    Args:
        ngrams: sequence of entries where entry[0] is an iterable of words (joined
            with single spaces to form the bar label) and entry[1] is the count
            (bar length). If two entries produce the same label, the last wins.
        title: text shown above the chart.
        color: bar colour passed on to matplotlib.
    """
    frequency_by_phrase = {" ".join(entry[0]): entry[1] for entry in ngrams}
    bar_positions = range(len(frequency_by_phrase))
    bar_labels = list(frequency_by_phrase.keys())
    bar_lengths = list(frequency_by_phrase.values())

    plt.figure(figsize=(8,6))
    plt.title(title, fontsize=14)
    plt.barh(bar_positions, bar_lengths, color=color,align="center")
    plt.yticks(bar_positions, bar_labels, fontsize=12)
    # barh puts position 0 at the bottom; flip the axis so it ends up on top.
    plt.gca().invert_yaxis()
    plt.show()

In [None]:
# Tokenise the lower-cased text, drop stop words, and count what is left.
all_words = nltk.tokenize.word_tokenize(text.lower())
kept_words = [w.lower() for w in all_words if w not in stopwords]
all_words_no_stop = nltk.FreqDist(kept_words)

top_words = all_words_no_stop.most_common(10)
plot_words(top_words, "Top 10 frequent words")

Unsurprisingly, these are the **names of the main characters**.

Now let's get the most frequent bigrams and trigrams, i.e. the sequences of two and three neighbouring words respectively.

In [None]:
# Stop words are removed first, so a "bigram" here is a pair of words that are
# neighbours in the filtered token stream.
bigram_tokens = [w.lower() for w in all_words if w not in stopwords]
bigram = nltk.FreqDist(nltk.bigrams(bigram_tokens))

top_bigrams = bigram.most_common(10)
plot_ngrams(top_bigrams, "Top 10 frequent bigrams.")

In [None]:
# Stop words are removed first, so a "trigram" here is a triple of words that are
# neighbours in the filtered token stream.
trigram_tokens = [w.lower() for w in all_words if w not in stopwords]
trigrams = nltk.FreqDist(nltk.trigrams(trigram_tokens))

top_trigrams = trigrams.most_common(10)
plot_ngrams(top_trigrams, "Top 10 frequent trigrams.", "#2b2e2b")

What will change if we remove the names?

In [None]:
# Main-cast names, the coffee house, and a few screenplay phrases to filter out.
characters = [
    'monica', 'rachel', 'ross', 'joey', 'chandler', 'phoebe', 'central perk',
    "opening credits scene", "commercial break scene", "hey hey hey", "hey",
    "closing credits scene", "scene",
]

# Break every phrase into its individual lower-cased words and keep each word once.
names = {word for phrase in characters for word in phrase.lower().split()}

# In-place union: this grows the existing stop-word set, so any other name bound to
# that same set object sees the additions too.
nlp.Defaults.stop_words |= names

In [None]:
# Same count as before, but filtered against the stop-word set as it stands now.
words_without_names = [w.lower() for w in all_words if w not in stopwords]
no_names = nltk.FreqDist(words_without_names)

top_words_without_names = no_names.most_common(10)
plot_words(top_words_without_names, "Top 10 frequent words except for names")

In [None]:
# Pairs of neighbouring tokens after filtering against the current stop-word set.
bigram_tokens_without_names = [w.lower() for w in all_words if w not in stopwords]
no_names_bigram = nltk.FreqDist(nltk.bigrams(bigram_tokens_without_names))

top_bigrams_without_names = no_names_bigram.most_common(10)
plot_ngrams(top_bigrams_without_names, "Top 10 frequent bigrams except for names")

In [None]:
# Triples of neighbouring tokens after filtering against the current stop-word set.
trigram_tokens_without_names = [w.lower() for w in all_words if w not in stopwords]
no_names_trigram = nltk.FreqDist(nltk.trigrams(trigram_tokens_without_names))

top_trigrams_without_names = no_names_trigram.most_common(10)
plot_ngrams(top_trigrams_without_names, "Top 10 frequent trigrams except for names", "#2b2e2b")

## Word Cloud for Season 1

In [None]:
# the mask image taken from http://www.designcenterassoc.com/wp-content/uploads/2017/11/Friends-PNG-HD-e1509653607131.png
# cooper_mask = np.array(Image.open('../input/temporary/Friends-PNG-HD-e1509653607131.png'))

def color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    """Recolor callback for WordCloud: every word gets the same dark-red HSL colour."""
    return "hsl(0, 100%, 27%)"

# NOTE(review): per the wordcloud documentation the contour settings only apply when
# a mask is supplied; the mask above is commented out, so they are presumably inert.
wc = WordCloud(background_color="white", max_words=1000,
               stopwords=stopwords, contour_width=4, contour_color='steelblue')

# Build the cloud from the actual counts. Joining the unique keys into a text (as was
# done before) feeds every word exactly once, so word size could not reflect how often
# a word occurs. generate_from_frequencies() ignores the `stopwords` argument, so the
# counts are filtered against the current stop-word set here.
cloud_frequencies = {
    word: count for word, count in all_words_no_stop.items() if word not in stopwords
}
wc.generate_from_frequencies(cloud_frequencies)

plt.figure(figsize=(18, 10))
plt.imshow(wc.recolor(color_func=color_func, random_state=3),interpolation="bilinear")
plt.axis("off")
# Render the figure explicitly so the cell does not echo the return value of plt.axis().
plt.show()

## How many times was coffee mentioned?

In [None]:
# Occurrences of the token 'coffee' among the non-stop-word tokens.
f"Well, exactly {all_words_no_stop['coffee']} times"

## And what about the famous "How you doin'?"

In [None]:
# Occurrences of the single token 'doin' among the non-stop-word tokens; this counts
# the word itself, not the full phrase.
f"It was mentioned {all_words_no_stop['doin']} times throughout all episodes"

## 4. What's next?
It would be great to do some sentiment analysis on the scripts.