In [2]:
import numpy as np
import pandas as pd
import os
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

# Reading Data

In [3]:
raw_dataset = pd.read_csv('office_data.csv')
raw_dataset

Unnamed: 0,id,season,episode,scene,line_text,speaker,deleted
0,1,1,1,1,All right Jim. Your quarterlies look very good...,Michael,False
1,2,1,1,1,"Oh, I told you. I couldn't close it. So...",Jim,False
2,3,1,1,1,So you've come to the master for guidance? Is ...,Michael,False
3,4,1,1,1,"Actually, you called me in here, but yeah.",Jim,False
4,5,1,1,1,"All right. Well, let me show you how it's done.",Michael,False
5,6,1,1,2,"[on the phone] Yes, I'd like to speak to your ...",Michael,False
6,7,1,1,3,"I've, uh, I've been at Dunder Mifflin for 12 y...",Michael,False
7,8,1,1,3,Well. I don't know.,Pam,False
8,9,1,1,3,"If you think she's cute now, you should have s...",Michael,False
9,10,1,1,3,What?,Pam,False


In [4]:
#Create a dataframe containing the counts of each word in each line
raw_dataset['speaker'].unique()

array(['Michael', 'Jim', 'Pam', 'Dwight', 'Jan', 'Michel', 'Todd Packer',
       'Phyllis', 'Stanley', 'Oscar', 'Angela', 'Kevin', 'Ryan', 'Man',
       'Roy', 'Documentary Crew Member', 'Mr. Brown', 'Toby', 'Kelly',
       'Meredith', 'Travel Agent', 'Man on Phone', 'Everybody', 'Lonny',
       'Darryl', 'Teammates', 'Michael and Dwight', 'Warehouse worker',
       'Madge', 'Worker', 'Packer', 'Warehouse Worker', 'Katy',
       'Guy at bar', 'Other Guy at Bar', 'Guy At Bar', 'Pam and Jim',
       'Employee', "Chili's Employee", 'Waitress', 'Manager',
       "Kevin's computer", 'Warehouse Guy', 'Warehouse guy',
       'Warehouse guys', 'Video', 'Man in Video', 'Actor',
       'Redheaded Actress', "Mr. O'Malley", 'Albiny', "Pam's Mom", 'Carol',
       'Bill', 'Everyone', 'Crowd', 'song', 'Song', 'Dwight and Michael',
       'Sherri', 'Creed', 'Devon', 'Children', 'Kid',
       'Vance Refrigeration Worker #1', 'Vance Refrigeration Worker #2',
       'Hank the Security Guard', 'Ira', "Rya

In [5]:
cleaned_data = raw_dataset.loc[raw_dataset['speaker'].isin(['Michael', 'Jim', 'Pam', 'Dwight', 'Jan', 'Michel', 'Todd Packer',
       'Phyllis', 'Stanley', 'Oscar', 'Angela', 'Kevin', 'Ryan', 'Roy', 'Toby', 'Kelly',
       'Meredith', 'Darryl', 'Packer'])]
cleaned_data

Unnamed: 0,id,season,episode,scene,line_text,speaker,deleted
0,1,1,1,1,All right Jim. Your quarterlies look very good...,Michael,False
1,2,1,1,1,"Oh, I told you. I couldn't close it. So...",Jim,False
2,3,1,1,1,So you've come to the master for guidance? Is ...,Michael,False
3,4,1,1,1,"Actually, you called me in here, but yeah.",Jim,False
4,5,1,1,1,"All right. Well, let me show you how it's done.",Michael,False
5,6,1,1,2,"[on the phone] Yes, I'd like to speak to your ...",Michael,False
6,7,1,1,3,"I've, uh, I've been at Dunder Mifflin for 12 y...",Michael,False
7,8,1,1,3,Well. I don't know.,Pam,False
8,9,1,1,3,"If you think she's cute now, you should have s...",Michael,False
9,10,1,1,3,What?,Pam,False


In [13]:
episode_data = cleaned_data.groupby(['season', 'episode', 'speaker', 'line_text']).sum()
episode_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,id,scene,deleted
season,episode,speaker,line_text,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,Angela,I bet it's gonna be me. Probably gonna be me.,68,14,0.0
1,1,Angela,"I have a lot of cat figurines on my desk. I think cats, really, are old souls. And, uh, Phyllis thinks so too.",253,46,1.0
1,1,Angela,"My name is Angela and um, I'm in charge of the accounting department. There are three of us, Kevin and Oscar.",246,46,1.0
1,1,Angela,"Not very worried about downsizing right now. Because, uh, I think you've met Kevin and um...",251,46,1.0
1,1,Angela,"Oscar, how do I describe him. He's like a stapler. Do I need a stapler? Yes. But, I'm still the one that has to push it down.",248,46,1.0
1,1,Dwight,Damn it! Jim!,147,30,0.0
1,1,Dwight,Downsizing?,95,19,0.0
1,1,Dwight,"Downsizing? I have no problem with that. I have been recommending downsizing since I first got here. I even brought it up in my interview. I say, bring it on.",96,20,0.0
1,1,Dwight,"Dwight Schrute, Assistant Regional Manager.",142,29,0.0
1,1,Dwight,"Dwight Schrute. My father's name, also Dwight Schrute. My grandfather's name, Dweide Schrude, Amish. That's my family. I don't know where they came, the Amish, came from originally. Uh, Amland.",230,40,1.0


In [15]:
episode_data_cleaned = episode_data.reset_index()
episode_data_cleaned

Unnamed: 0,season,episode,speaker,line_text,id,scene,deleted
0,1,1,Angela,I bet it's gonna be me. Probably gonna be me.,68,14,0.0
1,1,1,Angela,I have a lot of cat figurines on my desk. I th...,253,46,1.0
2,1,1,Angela,"My name is Angela and um, I'm in charge of the...",246,46,1.0
3,1,1,Angela,Not very worried about downsizing right now. B...,251,46,1.0
4,1,1,Angela,"Oscar, how do I describe him. He's like a stap...",248,46,1.0
5,1,1,Dwight,Damn it! Jim!,147,30,0.0
6,1,1,Dwight,Downsizing?,95,19,0.0
7,1,1,Dwight,Downsizing? I have no problem with that. I hav...,96,20,0.0
8,1,1,Dwight,"Dwight Schrute, Assistant Regional Manager.",142,29,0.0
9,1,1,Dwight,"Dwight Schrute. My father's name, also Dwight ...",230,40,1.0


Create a DataFrame containing the word frequencies per speaker per episode

In [29]:
d = []
temp = ''
curr_speaker = episode_data_cleaned['speaker'][0]

for i in np.arange(len(episode_data_cleaned['speaker'])):
    if curr_speaker == episode_data_cleaned['speaker'][i]:
        temp += episode_data_cleaned['line_text'][i]
    else:
        curr_speaker = episode_data_cleaned['speaker'][i]
        lines = temp.split()
        e = {x : lines.count(x) for x in lines}
        e['CLASS'] = episode_data_cleaned['speaker'][i - 1]
        d.append(e)
        temp = episode_data_cleaned['line_text'][i]
d

[{'And,': 1,
  'Angela': 1,
  'Because,': 1,
  'But,': 1,
  'CLASS': 'Angela',
  'Do': 1,
  "He's": 1,
  'I': 5,
  "I'm": 2,
  'Kevin': 2,
  'Oscar.Not': 1,
  'Phyllis': 1,
  'Probably': 1,
  'There': 1,
  'Yes.': 1,
  'a': 3,
  'about': 1,
  'accounting': 1,
  'and': 3,
  'are': 2,
  'be': 2,
  'bet': 1,
  'cat': 1,
  'cats,': 1,
  'charge': 1,
  'department.': 1,
  'describe': 1,
  'desk.': 1,
  'do': 1,
  'down.': 1,
  'downsizing': 1,
  'figurines': 1,
  'gonna': 2,
  'has': 1,
  'have': 1,
  'him.': 1,
  'how': 1,
  'in': 1,
  'is': 1,
  'it': 1,
  "it's": 1,
  'like': 1,
  'lot': 1,
  'me.': 1,
  'me.I': 1,
  'met': 1,
  'my': 1,
  'name': 1,
  'need': 1,
  'now.': 1,
  'of': 3,
  'old': 1,
  'on': 1,
  'one': 1,
  'push': 1,
  'really,': 1,
  'right': 1,
  'so': 1,
  'souls.': 1,
  'stapler.': 1,
  'stapler?': 1,
  'still': 1,
  'that': 1,
  'the': 2,
  'think': 2,
  'thinks': 1,
  'three': 1,
  'to': 1,
  'too.My': 1,
  'uh,': 2,
  'um,': 1,
  'um...Oscar,': 1,
  'us,': 1,
  'v

In [23]:
words = pd.DataFrame(d).fillna(0)
words

Unnamed: 0,!,!?!,!What,#1],#2,#5.',$0.50',"$1,000","$1,200","$1,200.",...,���they,���tiny,���we,���we're,���wells.���,���well���,���where,���you,���you���re,�����Cause
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
words_shuffled = words.sample(frac = 1)
train_set, test_set = train_test_split(words_shuffled, test_size = 0.2, train_size = 0.8)

In [25]:
X_train = train_set.drop(labels = ['CLASS'], axis = 1)
X_test = test_set.drop(labels = ['CLASS'], axis = 1)
y_train = train_set['CLASS']
y_test = test_set['CLASS']

X_train

Unnamed: 0,!,!?!,!What,#1],#2,#5.',$0.50',"$1,000","$1,200","$1,200.",...,���they,���tiny,���we,���we're,���wells.���,���well���,���where,���you,���you���re,�����Cause
1869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
517,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
891,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg.score(X_test, y_test)

0.35729847494553379