# Topic Mining 1
**Note**: 
This part of the code counts how many times the names of each movie's directors, writers, and cast members appear in that movie's posts.

**Data Required**:
imdb_scraped.csv, fbposts.csv

In [404]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import gensim
import nltk

In [385]:
# text-cleaning helpers
def remove_special_characters(text, remove_digits=False):
    """Strip every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: The string to clean.
        remove_digits: When true, digits are removed as well, leaving only
            ASCII letters and whitespace.

    Returns:
        The cleaned string. Whitespace is left as-is, so removing a character
        that sat between two spaces leaves both spaces behind.
    """
    if remove_digits:
        disallowed = r'[^a-zA-Z\s]'
    else:
        disallowed = r'[^a-zA-Z0-9\s]'
    return re.sub(disallowed, '', text)

def normalize(articles):
    """Clean and tokenize a collection of documents.

    Each document is reduced to ASCII letters and whitespace (digits are
    removed too), split into word tokens, and tokens of one or two characters
    are discarded.

    Args:
        articles: Iterable of strings, one per document.

    Returns:
        A list with one token list per input document, in input order.
        Documents that end up with no tokens contribute an empty list rather
        than being dropped, so output positions match input positions.
    """
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    cleaned_docs = []
    for raw_text in articles:
        letters_only = remove_special_characters(raw_text, True)
        words = word_tokenizer.tokenize(letters_only)
        # keep only words longer than two characters
        cleaned_docs.append([word for word in words if len(word) > 2])
    return cleaned_docs

In [386]:
# Read the movie data scraped from IMDb using the automated crawler.
# Later cells use its imdb_id, director, writer and star columns.
data1 = pd.read_csv('imdb_scraped.csv')

In [387]:
# Read the posts table; later cells use its imdb_id and
# message_and_description columns.
data2 = pd.read_csv('fbposts.csv')

In [388]:
# Group each movie's posts into one string and use that as the unit of
# analysis. The result has one row per imdb_id with the columns
# 'imdb_id' and 'posts'.
data2 = (
    data2.groupby(['imdb_id'])['message_and_description']
    # Skip missing messages and coerce the rest to str: str.join raises
    # TypeError as soon as a group contains NaN or another non-string cell.
    .apply(lambda messages: ' '.join(messages.dropna().astype(str)))
    .to_frame('posts')
    .reset_index()
)

In [389]:
# Keep only movies in imdb_scraped that have posts, and align them with data2:
# exactly one row per data2 row, in data2's order. The counting cells pair
# director/writer/star entries with posts by position, so a plain isin()
# filter (which keeps the CSV's own order and any duplicate rows) can attach
# the wrong names to a movie. Movies in data2 without scraped metadata get a
# row of missing values instead of shifting every later row.
data1 = data2[['imdb_id']].merge(
    data1.drop_duplicates(subset='imdb_id'), on='imdb_id', how='left'
)

In [390]:
# Pull the credit columns out as plain lists, one entry per movie.
# Missing credits are replaced by '' here: the name-splitting step converts
# every entry with str(), which would turn NaN into the literal name 'nan'
# and let the token "nan" in a post be counted as a credited person.
director = ['' if pd.isna(credit) else credit for credit in data1.director]
writer = ['' if pd.isna(credit) else credit for credit in data1.writer]
star = ['' if pd.isna(credit) else credit for credit in data1.star]

In [391]:
# Take the merged per-movie text column for cleaning.
posts = data2['posts']

In [392]:
# Clean and tokenize every movie's text. posts becomes a list of token lists,
# one per entry of the input and in the same order.
posts = normalize(posts)

In [393]:
# use bigrams to identify names
def preprocess1(content):
    """Merge detected two-word phrases in each document into single tokens.

    A gensim ``Phrases`` model is trained on ``content`` itself (with
    ``min_count=5`` and ``threshold=10``) and then applied to every document,
    so detected word pairs are joined with ``'_'``.

    Args:
        content: Collection of token lists. It is iterated twice (once to
            train, once to transform), so it must be re-iterable.

    Returns:
        A list holding the transformed token sequence for each document, in
        input order.
    """
    phrase_detector = gensim.models.Phrases(
        content, min_count=5, threshold=10, delimiter='_'
    )
    phrase_applier = gensim.models.phrases.Phraser(phrase_detector)
    merged_docs = []
    for tokens in content:
        merged_docs.append(phrase_applier[tokens])
    return merged_docs

In [394]:
# separate names that were concatenated without a delimiter
def insertspace(columnname):
    """Insert ``'_'`` at likely boundaries between run-together names.

    Every entry is converted with ``str()`` first. An underscore is then
    inserted between a word character or ``)`` and a following character from
    the class ``[A-Z,É]``, e.g. ``'Steven SpielbergGeorge Lucas'`` becomes
    ``'Steven Spielberg_George Lucas'``.

    NOTE(review): the comma inside ``[A-Z,É]`` is a literal member of the
    class, so ``'a,b'`` becomes ``'a_,b'`` -- confirm this is intended.

    Args:
        columnname: Iterable of raw credit entries (any type).

    Returns:
        A list of strings, one per entry, with the separators inserted.
    """
    name_boundary = re.compile(r"(\w|\))([A-Z,É])")
    return [name_boundary.sub(r"\1_\2", str(entry)) for entry in columnname]

In [395]:
# convert names into lists
def preprocess2(columnname):
    """Split each raw credit entry into a list of underscore-joined names.

    Steps applied to every entry:
      1. ``insertspace`` marks boundaries between run-together names with '_'.
      2. Boundaries that step 1 wrongly puts inside a single name are undone
         ('Mc_' -> 'Mc', 'Mac_' -> 'Mac', 'R_' -> 'R').
         NOTE(review): the 'R_' rule rejoins *any* split after a capital R --
         confirm which names it is meant for.
      3. Parenthesised remarks such as '(screenplay)' are removed.
      4. The entry is split on '_' and the spaces inside each name are
         replaced by '_', giving tokens like 'John_Smith'.

    Whitespace around each name is stripped before step 4's replacement.
    Without that, removing a trailing '(…)' leaves 'John Smith ' which turns
    into 'John_Smith_' and can never equal a 'John_Smith' token.

    Args:
        columnname: Iterable of raw credit entries, one per movie.

    Returns:
        A list with one list of name strings per entry, in input order.
    """
    # Applied in this order, matching the sequence of fixes described above.
    rejoin_rules = (('Mc_', 'Mc'), ('Mac_', 'Mac'), ('R_', 'R'))
    names_per_movie = []
    for entry in insertspace(columnname):
        for broken, joined in rejoin_rules:
            entry = entry.replace(broken, joined)
        # Raw string: '\(' in a normal literal is an invalid escape sequence.
        entry = re.sub(r'\(.*?\)', '', entry)
        names_per_movie.append(
            [part.strip().replace(' ', '_') for part in entry.split('_')]
        )
    return names_per_movie

In [396]:
# Turn each credit column into one list of underscore-joined names per movie.
director = preprocess2(director)
writer = preprocess2(writer)
star = preprocess2(star)

In [397]:
# Join detected word pairs in the posts with '_' so they can be compared
# with the underscore-joined names built by preprocess2.
posts = preprocess1(posts)

In [398]:
# Per-movie name counts for directors, writers and stars (three separate lists).
counts_d, counts_w, counts_s = [], [], []

In [399]:
# count how many times directors' names appear in posts
# enumerate() supplies each document's own position. Looking the position up
# with posts.index(doc) returns the FIRST equal document, so movies with
# identical (e.g. empty) token lists would all be scored against the first
# such movie's names, and every lookup rescans the whole list.
for index, doc in enumerate(posts):
    counts_d.append(sum(doc.count(name) for name in director[index]))

In [400]:
# count how many times writers' names appear in posts
# enumerate() supplies each document's own position. Looking the position up
# with posts.index(doc) returns the FIRST equal document, so movies with
# identical (e.g. empty) token lists would all be scored against the first
# such movie's names, and every lookup rescans the whole list.
for index, doc in enumerate(posts):
    counts_w.append(sum(doc.count(name) for name in writer[index]))

In [401]:
# count how many times stars' names appear in posts
# enumerate() supplies each document's own position. Looking the position up
# with posts.index(doc) returns the FIRST equal document, so movies with
# identical (e.g. empty) token lists would all be scored against the first
# such movie's names, and every lookup rescans the whole list.
for index, doc in enumerate(posts):
    counts_s.append(sum(doc.count(name) for name in star[index]))

In [402]:
# Attach the per-movie counts to the posts table as new columns:
# director, writer and star name counts respectively.
data2['dircount'] = counts_d
data2['wricount'] = counts_w
data2['strcount'] = counts_s

In [403]:
# Save the posts table together with the three count columns.
data2.to_csv('count3.csv')