# Topic Mining 2
**Note**: 
This part of the code calculates the TF-IDF cosine similarity between the content of each post and the overview of the movie it refers to.

**Data Required**:
imdb_scraped.csv, fbposts.csv

In [93]:
import pandas as pd
import numpy as np
import re
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [94]:
# Load the movie table; a later cell joins it onto the posts by 'imdb_id'.
data1 = pd.read_csv(filepath_or_buffer='imdb_scraped.csv')

In [95]:
# Load the posts table; its 'imdb_id' column is the key used for the join below.
data2 = pd.read_csv(filepath_or_buffer='fbposts.csv')

In [96]:
# Left-join the movie columns onto every post row, matching on 'imdb_id'.
# The movie table is re-indexed by the key so join() can look rows up by it.
movies_by_id = data1.set_index('imdb_id')
data2 = data2.join(movies_by_id, on='imdb_id', how='left')

In [97]:
# Keep only the join key and the two text fields that are compared later.
text_columns = ['imdb_id', 'message_and_description', 'outline']
data2 = data2[text_columns]

In [98]:
# Preview the trimmed frame (the notebook display elides the middle rows).
data2

Unnamed: 0,imdb_id,message_and_description,outline
0,tt0485985,Ne-Yo: I don't want to play singers - Irish I...,A crew of African American pilots in the Tuske...
1,tt0485985,Ne-Yo: I don't want to play singers - Belfast...,A crew of African American pilots in the Tuske...
2,tt0485985,Ne-Yo: I like roles that have little or nothi...,A crew of African American pilots in the Tuske...
3,tt0485985,Film sheds light on plight of Lakotas - San F...,A crew of African American pilots in the Tuske...
4,tt0485985,Disney buying Lucasfilm for $4.05B - WDTN WDT...,A crew of African American pilots in the Tuske...
...,...,...,...
48264,tt1656190,The Wall Street Journal says “Warrior is that ...,"Mei, a young girl whose memory holds a pricele..."
48265,tt1656190,,"Mei, a young girl whose memory holds a pricele..."
48266,tt1656190,,"Mei, a young girl whose memory holds a pricele..."
48267,tt1656190,Check it out! IGN.com exclusively debuts the f...,"Mei, a young girl whose memory holds a pricele..."


In [99]:
# Post text as a plain list, in the same row order as data2.
posts = data2['message_and_description'].tolist()

In [100]:
# Movie outline text as a plain list, aligned row-for-row with `posts`.
outline = data2['outline'].tolist()

In [101]:
# Translation table for str.translate(): every character in string.punctuation
# (ASCII punctuation only) maps to None, i.e. it is deleted.
remove_punctuation_map = str.maketrans('', '', string.punctuation)

def normalize(text):
    """Lowercase ``text``, delete ASCII punctuation, and split it into tokens.

    No stemming is performed: tokens are returned exactly as
    ``nltk.word_tokenize`` produces them from the cleaned string.

    Only the characters in ``string.punctuation`` are removed (via
    ``remove_punctuation_map``); non-ASCII punctuation such as curly quotes
    is left in place.

    Args:
        text: The string to tokenize.

    Returns:
        The tokens produced by ``nltk.word_tokenize``.
    """
    return nltk.word_tokenize(text.lower().translate(remove_punctuation_map))

# Shared TF-IDF vectorizer: tokenizes with normalize() and filters out
# scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(
    stop_words='english',
    tokenizer=normalize,
)

In [102]:
def cosine_sim(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    The module-level ``vectorizer`` is re-fitted on just these two documents,
    so the IDF weights are specific to the pair being compared.

    Args:
        text1: First document. Values that are not text (e.g. ``None`` or a
            NaN read from an empty CSV field) are treated as an empty string.
        text2: Second document, handled the same way as ``text1``.

    Returns:
        The similarity as a float; ``0.0`` when the two documents share no
        terms or when no usable token is left in either of them.

    Raises:
        ValueError: Propagated from the vectorizer for any failure other than
            an empty vocabulary.
    """
    # Non-text values would make the vectorizer's preprocessing fail, so they
    # are replaced by an empty document, which scores 0 against anything.
    docs = [doc if isinstance(doc, (str, bytes)) else '' for doc in (text1, text2)]

    try:
        tfidf = vectorizer.fit_transform(docs)
    except ValueError as exc:
        # scikit-learn raises "empty vocabulary ..." when every token of both
        # documents is removed (empty texts, stop words only). There is
        # nothing to compare in that case, so the similarity is 0.
        if 'empty vocabulary' not in str(exc):
            raise
        return 0.0

    # TfidfVectorizer L2-normalises each row by default, so the off-diagonal
    # entry of the Gram matrix is the cosine similarity. `@` and `.toarray()`
    # are used instead of `*` and `.A`, which recent SciPy no longer supports
    # uniformly on sparse results.
    return (tfidf @ tfidf.T).toarray()[0, 1]

In [103]:
# Per-row similarity scores; filled in by the scoring cell that follows.
results = list()

In [104]:
# Score every (post, outline) pair. Iterating over the lists themselves
# avoids a hard-coded row count, and rebuilding `results` from scratch keeps
# this cell safe to re-run without doubling the list.
results = [cosine_sim(post, overview) for post, overview in zip(posts, outline)]

In [105]:
# Attach the scores as a new column; `results` was built in data2's row order,
# so the values line up positionally with the rows.
data2['score']=results

In [106]:
# Inspect the frame again, now including the 'score' column.
data2

Unnamed: 0,imdb_id,message_and_description,outline,score
0,tt0485985,Ne-Yo: I don't want to play singers - Irish I...,A crew of African American pilots in the Tuske...,0.046990
1,tt0485985,Ne-Yo: I don't want to play singers - Belfast...,A crew of African American pilots in the Tuske...,0.046990
2,tt0485985,Ne-Yo: I like roles that have little or nothi...,A crew of African American pilots in the Tuske...,0.084405
3,tt0485985,Film sheds light on plight of Lakotas - San F...,A crew of African American pilots in the Tuske...,0.000000
4,tt0485985,Disney buying Lucasfilm for $4.05B - WDTN WDT...,A crew of African American pilots in the Tuske...,0.079590
...,...,...,...,...
48264,tt1656190,The Wall Street Journal says “Warrior is that ...,"Mei, a young girl whose memory holds a pricele...",0.000000
48265,tt1656190,,"Mei, a young girl whose memory holds a pricele...",0.000000
48266,tt1656190,,"Mei, a young girl whose memory holds a pricele...",0.000000
48267,tt1656190,Check it out! IGN.com exclusively debuts the f...,"Mei, a young girl whose memory holds a pricele...",0.000000


In [107]:
# Aggregate to one row per movie. The scores are summed (not averaged), so a
# movie with more posts accumulates a larger total.
posts_per_movie = data2.groupby('imdb_id', as_index=False)
data3 = posts_per_movie['score'].sum()

In [108]:
# Persist the per-movie totals; with the default settings the frame's index
# is written as the first column of the file.
data3.to_csv(path_or_buf='score.csv')