In [88]:
import requests
from requests.exceptions import RequestException
import re
import io
import sys
import json
import pandas as pd




# Get website
def get_one_page(url):
	try:
		headers = {
			'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
		}
		response = requests.get(url, headers=headers)
		if response.status_code == 200:
			return response.text.replace('\n', '').replace('\t', '')
		return None
	except RequestException:
		return None


# Extract information and convert to dictioinary
def parse_one_page(html):
	pattern = re.compile(
		'chttp_tt_(.*?)\S>.*?href.*?/title/(.*?)/.*?title.*?>(.*?)</a>.*?>\((.*?)\)</span>.*?imdbRating.*?ratings\S\S(.*?)</strong>.*?"watchlistColumn">',
		re.S)
	items = re.findall(pattern, html)
	for item in items:
		yield {
			'Rank': item[0],
			'IMDbLink': item[1],
			'Name': item[2],
			'Release_Date': item[3],
			'Score': item[4]
		}


# Write down
def write_to_file(content):
	with open('imdb.txt', 'a', encoding='utf-8') as f:
		f.write(json.dumps(content, ensure_ascii=False) + '\n')


def write_xls(data):
	pf = pd.DataFrame(data)
	order = ['Rank', 'IMDbLink', 'Name', 'Release_Date', 'Score']
	pf = pf[order]
	columns_map = {
		'Rank': 'Rank',
		'IMDbLink':'IMDbLink',
		'Name': 'Name',
		'Release_Date': 'Release_Date',
		'Score': 'Score'
	}
	pf.rename(columns=columns_map, inplace=True)
	file_path = pd.ExcelWriter('IMDb250.xlsx')
	pf.fillna(' ', inplace=True)
	pf.to_excel(file_path, encoding='utf-8', index=False)
	file_path.save()


def main():
	url = 'http://www.imdb.com/chart/top?ref_=nv_mv_250_6'
	html = get_one_page(url)
	result = []
	for item in parse_one_page(html):
		result.append(item)
	write_xls(result)
		# write_to_file(item)


main()

In [89]:
df = pd.read_excel('IMDb250.xlsx')
df

Unnamed: 0,Rank,IMDbLink,Name,Release_Date,Score
0,1,tt0111161,The Shawshank Redemption,1994,9.2
1,2,tt0068646,The Godfather,1972,9.2
2,3,tt0468569,The Dark Knight,2008,9.0
3,4,tt0071562,The Godfather Part II,1974,9.0
4,5,tt0050083,12 Angry Men,1957,9.0
...,...,...,...,...,...
245,246,tt0071411,Dersu Uzala,1975,8.0
246,247,tt0103639,Aladdin,1992,8.0
247,248,tt1454029,The Help,2011,8.0
248,249,tt0083987,Gandhi,1982,8.0


In [90]:
df1 = pd.read_csv('imdb_top_250.csv')
df['overview'] = df1['Overview']
df

Unnamed: 0,Rank,IMDbLink,Name,Release_Date,Score,overview
0,1,tt0111161,The Shawshank Redemption,1994,9.2,Two imprisoned men bond over a number of years...
1,2,tt0068646,The Godfather,1972,9.2,An organized crime dynasty's aging patriarch t...
2,3,tt0468569,The Dark Knight,2008,9.0,When the menace known as the Joker wreaks havo...
3,4,tt0071562,The Godfather Part II,1974,9.0,The early life and career of Vito Corleone in ...
4,5,tt0050083,12 Angry Men,1957,9.0,A jury holdout attempts to prevent a miscarria...
...,...,...,...,...,...,...
245,246,tt0071411,Dersu Uzala,1975,8.0,"In order to power the city, monsters have to s..."
246,247,tt0103639,Aladdin,1992,8.0,Concurrent theatrical ending of the TV series ...
247,248,tt1454029,The Help,2011,8.0,The people of a small village in Victorian Ind...
248,249,tt0083987,Gandhi,1982,8.0,A boy who communicates with spirits seeks the ...


In [91]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

en_stops = set(stopwords.words('english'))
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [92]:
sentence_data = df['overview'][0]
tokenizer = RegexpTokenizer(r'\w+')
result = tokenizer.tokenize(sentence_data)
print(result)

['Two', 'imprisoned', 'men', 'bond', 'over', 'a', 'number', 'of', 'years', 'finding', 'solace', 'and', 'eventual', 'redemption', 'through', 'acts', 'of', 'common', 'decency']


In [93]:
[word for word in result if word not in en_stops]

['Two',
 'imprisoned',
 'men',
 'bond',
 'number',
 'years',
 'finding',
 'solace',
 'eventual',
 'redemption',
 'acts',
 'common',
 'decency']

In [94]:

def word_bag(value,w):
    sentence_data = value
    tokenizer = RegexpTokenizer(r'\w+')
    result = tokenizer.tokenize(sentence_data)
    wordbag = [word for word in result if word not in en_stops]
    num = str(wordbag).count(w)
    list.append(num)
    

In [101]:
list = []
for x in range(250):
    value = df['overview'][x]
    w = 'men'
    word_bag(value,w)
df['men'] = list

In [102]:
list = []
for x in range(250):
    value = df['overview'][x]
    w = 'girl'
    word_bag(value,w)
df['girl'] = list

In [103]:
list = []
for x in range(250):
    value = df['overview'][x]
    w = 'dog'
    word_bag(value,w)
df['dog'] = list

In [104]:
list = []
for x in range(250):
    value = df['overview'][x]
    w = 'kill'
    word_bag(value,w)
df['kill'] = list

In [105]:
list = []
for x in range(250):
    value = df['overview'][x]
    w = 'love'
    word_bag(value,w)
df['love'] = list

In [106]:
df

Unnamed: 0,Rank,IMDbLink,Name,Release_Date,Score,overview,men,girl,dog,kill,love
0,1,tt0111161,The Shawshank Redemption,1994,9.2,Two imprisoned men bond over a number of years...,1,0,0,0,0
1,2,tt0068646,The Godfather,1972,9.2,An organized crime dynasty's aging patriarch t...,0,0,0,0,0
2,3,tt0468569,The Dark Knight,2008,9.0,When the menace known as the Joker wreaks havo...,1,0,0,0,0
3,4,tt0071562,The Godfather Part II,1974,9.0,The early life and career of Vito Corleone in ...,0,0,0,0,0
4,5,tt0050083,12 Angry Men,1957,9.0,A jury holdout attempts to prevent a miscarria...,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
245,246,tt0071411,Dersu Uzala,1975,8.0,"In order to power the city, monsters have to s...",0,0,0,0,0
246,247,tt0103639,Aladdin,1992,8.0,Concurrent theatrical ending of the TV series ...,0,0,0,0,0
247,248,tt1454029,The Help,2011,8.0,The people of a small village in Victorian Ind...,0,0,0,0,0
248,249,tt0083987,Gandhi,1982,8.0,A boy who communicates with spirits seeks the ...,0,0,0,0,0


### Recommend The Godfather to User2 with item to item Collaborative Filtering

In [128]:
df = pd.read_excel('rating.xlsx',index_col= 'Name')
df

Unnamed: 0_level_0,Score,User1,User2,User3,User4,User5
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
The Shawshank Redemption,9.2,7,8.0,8.0,7.0,9.0
The Godfather,9.2,7,,7.0,9.0,
The Dark Knight,9.0,8,7.0,,6.0,8.0
The Godfather Part II,9.0,9,5.0,6.0,4.0,6.0
12 Angry Men,9.0,9,5.0,9.0,8.0,
Schindler's List,8.9,8,6.0,9.0,9.0,6.0
The Lord of the Rings: The Return of the King,8.9,9,9.0,10.0,9.0,9.0
Pulp Fiction,8.8,10,10.0,4.0,6.0,8.0
The Lord of the Rings: The Fellowship of the Ring,8.8,8,10.0,8.0,6.0,
"The Good, the Bad and the Ugly",8.8,8,8.0,8.0,,7.0


In [130]:
df.fillna(0, inplace=True)
df_1 = df.values
df_1

array([[ 9.2,  7. ,  8. ,  8. ,  7. ,  9. ],
       [ 9.2,  7. ,  0. ,  7. ,  9. ,  0. ],
       [ 9. ,  8. ,  7. ,  0. ,  6. ,  8. ],
       [ 9. ,  9. ,  5. ,  6. ,  4. ,  6. ],
       [ 9. ,  9. ,  5. ,  9. ,  8. ,  0. ],
       [ 8.9,  8. ,  6. ,  9. ,  9. ,  6. ],
       [ 8.9,  9. ,  9. , 10. ,  9. ,  9. ],
       [ 8.8, 10. , 10. ,  4. ,  6. ,  8. ],
       [ 8.8,  8. , 10. ,  8. ,  6. ,  0. ],
       [ 8.8,  8. ,  8. ,  8. ,  0. ,  7. ]])

In [131]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(df_1)
distances, indices = knn.kneighbors(df_1, n_neighbors=3)

In [132]:
indices

array([[0, 6, 5],
       [1, 4, 5],
       [2, 7, 3],
       [3, 0, 7],
       [4, 8, 1],
       [5, 6, 0],
       [6, 0, 5],
       [7, 2, 0],
       [8, 4, 5],
       [9, 3, 0]], dtype=int64)

In [133]:
distances

array([[0.        , 0.00687882, 0.02465749],
       [0.        , 0.05087719, 0.10614513],
       [0.        , 0.03206341, 0.08560607],
       [0.        , 0.03771823, 0.04174608],
       [0.        , 0.04599798, 0.05087719],
       [0.        , 0.01255192, 0.02465749],
       [0.        , 0.00687882, 0.01255192],
       [0.        , 0.03206341, 0.0396877 ],
       [0.        , 0.04599798, 0.08515285],
       [0.        , 0.04975594, 0.07132544]])

In [135]:
index_for_movie = df.index.tolist().index('The Godfather') # it returns 0
sim_movies = indices[index_for_movie].tolist() # make list for similar movies
movie_distances = distances[index_for_movie].tolist() # the list for distances of similar movies
id_movie = sim_movies.index(index_for_movie) # get the position of the movie itself in indices and distances
sim_movies.remove(index_for_movie) # remove the movie itself in indices
movie_distances.pop(id_movie) # remove the movie itself in distances

print('The Nearest Movies to The Godfather:', sim_movies)
print('The Distance from The God Father:', movie_distances)

The Nearest Movies to The Godfather: [4, 5]
The Distance from The God Father: [0.05087719147974168, 0.10614512607623006]


In [137]:
movie_similarity = [-x+1 for x in movie_distances] # inverse distance 

predicted_rating = (movie_similarity[0]*df.iloc[sim_movies[0],2] + movie_similarity[1]*df.iloc[sim_movies[1],2])/sum(movie_similarity)
print(predicted_rating)

5.485005804703181
