In [3]:
import urllib.request
import re
import requests
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import json
import time
from networkx.algorithms.community.centrality import girvan_newman
from fa2 import ForceAtlas2
import glob
import os
import nltk, pprint
from nltk import word_tokenize
from nltk.corpus import stopwords 
from nltk import FreqDist 
import wordcloud 
from wordcloud import WordCloud, STOPWORDS

In [30]:
def imdbId_to_wikiLink(imdbId):
    """Look up the English Wikipedia page title for an IMDb id via Wikidata.

    Parameters
    ----------
    imdbId : int or str
        IMDb identifier without the leading 'tt'. It is left-padded with
        zeros to seven characters before being used in the query.

    Returns
    -------
    str or None
        The part of the page URL that follows '/wiki/', exactly as it
        appears in the URL returned by the query (so it may contain '/'
        and URL escapes). '' if no title can be extracted from the URL.
        None when the query returns no bindings.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an error status other than 429.
    """
    imdb_tag = 'tt' + str(imdbId).zfill(7)

    url = 'https://query.wikidata.org/sparql'
    query = '''
    SELECT ?wppage WHERE {
    ?subject wdt:P345 '%s' .
      ?wppage schema:about ?subject .
      FILTER(contains(str(?wppage),'//en.wikipedia'))
    }
    ''' % imdb_tag
    params = {'format': 'json', 'query': query}

    r = requests.get(url, params=params)

    # 429 = rate limited: keep retrying, but wait as long as the server asks
    # (Retry-After in seconds) or, failing that, back off with a delay that
    # doubles until it exceeds 30 s instead of polling every 0.1 s.
    delay = 0.1
    while r.status_code == 429:
        retry_after = r.headers.get('Retry-After', '')
        time.sleep(float(retry_after) if retry_after.isdigit() else delay)
        if delay < 30.0:
            delay *= 2
        r = requests.get(url, params=params)

    # Fail with a clear HTTP error instead of an opaque JSON decoding error.
    r.raise_for_status()

    bindings = r.json()['results']['bindings']
    if len(bindings) == 0:
        return None

    weblink = bindings[0]['wppage']['value']

    # The title is everything after '/wiki/'. Taking only the text after the
    # last '/' would truncate titles that themselves contain a slash.
    _, marker, title = weblink.partition('/wiki/')
    if marker:
        return title

    # URL without '/wiki/': fall back to the last path segment, as before.
    wikimatch = re.search(r".+\/(.+)", weblink)
    if wikimatch is not None:
        return wikimatch.group(1)
    return ''

In [31]:
def wikiLink_to_wikiText(wikiLink):
    """Fetch the MediaWiki API response holding a page's current wikitext.

    Parameters
    ----------
    wikiLink : str
        Page title for English Wikipedia. It may be plain or already
        URL-escaped; both forms end up escaped exactly once in the request.

    Returns
    -------
    str
        The body of the API response (requested as JSON), decoded as UTF-8.
        The JSON is not parsed here.
    """
    from urllib.parse import quote, unquote

    #as described at https://www.mediawiki.org/wiki/API:Tutorial
    baseurl = "https://en.wikipedia.org/w/api.php?"
    action = "action=query"
    content = "prop=revisions&rvprop=content"
    dataformat = "format=json"

    # Use the function argument (the previous body read a differently spelled
    # name, `wikilink`, which is not the parameter). unquote + quote makes the
    # title safe for a query string without double-escaping titles that
    # already contain %XX sequences.
    title = "titles={}".format(quote(unquote(wikiLink), safe=''))
    query = "{}{}&{}&{}&{}".format(baseurl, action, content, title, dataformat)

    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(query) as wikiresponse:
        wikidata = wikiresponse.read()

    return wikidata.decode('utf-8')

In [32]:
def genre_in_common(movie1, movie2):
    """Tell whether two movie nodes of the global graph G share a genre.

    Both arguments are node keys of G; each node's 'genres' attribute is
    read. Returns True if at least one entry of the first movie's genres
    also occurs in the second movie's genres, otherwise False.
    """
    genres_first = G.nodes[movie1]['genres']
    genres_second = G.nodes[movie2]['genres']

    # one shared genre is enough
    return any(genre in genres_second for genre in genres_first)

In [33]:
def suggestMovie(movieId):
    """Recommend the movie most strongly linked to `movieId` in the graph G.

    Parameters
    ----------
    movieId : hashable
        Node key of the movie in G.

    Returns
    -------
    The 'title' attribute of the neighbour joined to `movieId` by the edge
    with the largest 'weight' (the first such edge on ties), or None when
    the movie has no edge with a positive weight. Previously that case
    failed with an UnboundLocalError.
    """
    #used to keep track of strongest link to other movie
    max_weight = 0
    recommended_edge = None

    for edge in G.edges(movieId):
        weight = G[edge[0]][edge[1]]['weight']

        #if link is the strongest we've seen, then save it
        if weight > max_weight:
            max_weight = weight
            recommended_edge = edge

    # nothing to recommend for a movie without (positively weighted) links
    if recommended_edge is None:
        return None

    #return the movie at the other end of the strongest link
    if recommended_edge[0] != movieId:
        recommended_movie = recommended_edge[0]
    else:
        recommended_movie = recommended_edge[1]

    return G.nodes[recommended_movie]['title']

In [34]:
def get_yearly_avg(year):
    """Average the 'avg_rating' of all movies in G released in `year`.

    The release year is taken from the node's 'title': the last four-digit
    number in parentheses, e.g. 'Toy Story (1995)'. Titles without such a
    group are ignored. Empty or short bracketed numbers such as '()' or
    '(06)' are no longer mistaken for the year.

    Parameters
    ----------
    year : int
        Release year to average over.

    Returns
    -------
    The mean 'avg_rating' of the matching movies, or 0 if none match.
    """
    #used to calculate average
    count = 0
    total = 0

    for movie in G.nodes:
        node = G.nodes[movie]

        #all four-digit numbers in (); the year is the last one in the title
        years = re.findall(r"\((\d{4})\)", node['title'])

        #if movie is from year, update count and total
        if years and int(years[-1]) == year:
            count += 1
            total += node['avg_rating']

    if count == 0:
        return 0
    return total / count

In [35]:
def movieId_to_filetitle(movieId):
    
    illegal = ['#','%','&','{','}','\\','$','!','\'','"',':','<','>','*','?','/',' ','+','`','|','=']

    fileTitle = G.nodes[movieId]['title'] + '.txt'
    
    for sign in illegal:
        fileTitle = fileTitle.replace(sign,'_')
    return fileTitle

In [36]:
# Read the three CSV tables (links, movies, ratings) from the working
# directory; the following cells turn them into lists and a graph.
readerLinks, readerMovies, readerRatings = [
    pd.read_csv(csv_name)
    for csv_name in ('links.csv', 'movies.csv', 'ratings.csv')
]


In [40]:
# Turn the leading columns of each table into plain lists of rows:
#   links   -> [movieId, imdbId]
#   movies  -> [movieId, title, genres]
#   ratings -> [userId, movieId, rating]
# itertuples walks the rows in bulk, which is far faster than one
# `.iloc[i, j]` lookup per cell (the ratings table can be very long), and it
# yields ordinary Python values.
links = [list(row) for row in readerLinks.iloc[:, :2].itertuples(index=False, name=None)]
movies = [list(row) for row in readerMovies.iloc[:, :3].itertuples(index=False, name=None)]
ratings = [list(row) for row in readerRatings.iloc[:, :3].itertuples(index=False, name=None)]

# Empty undirected graph; nodes and edges are added by the following cells.
G = nx.Graph()
    


In [42]:
# Sanity check: show one parsed row of `movies`, i.e. [movieId, title, genres].
print(movies[2])

['3', 'Grumpier Old Men (1995)', 'Comedy|Romance']


In [None]:
# Add one node per movie, keyed by its movieId. The rating statistics start at
# zero and are filled in while the ratings are processed.
# (The per-movie debug print was dropped: it wrote one line for every movie.)
for movie in movies:
    #make list of genres from the '|'-separated string, skipping empty pieces
    genres = [genre for genre in movie[2].split('|') if genre]

    G.add_node(movie[0], title=movie[1], avg_rating=0, rating_count=0, genres=genres)

In [None]:

# A rating of at least this value counts as the user "loving" the movie.
limit_for_love = 4

# Walk through all ratings once. Every rating updates the running average of
# its movie. A loved movie is additionally linked to each loved movie that the
# same user rated further down the list, provided the two share a genre.
# The forward scan stops at the first row with a different user id, so it
# relies on each user's ratings being stored consecutively.
total_ratings = len(ratings)

for i in range(total_ratings):
    user, movie, score = ratings[i][0], ratings[i][1], ratings[i][2]

    # running mean: (old mean * old count + new rating) / new count
    stats = G.nodes[movie]
    weighted_sum = stats['avg_rating'] * stats['rating_count']
    stats['rating_count'] += 1
    stats['avg_rating'] = (weighted_sum + score) / stats['rating_count']

    # movies the user does not love get no links
    if score < limit_for_love:
        continue

    # start one row further down so a rating is never paired with itself
    j = i + 1
    while j < total_ratings and ratings[j][0] == user:
        second_movie = ratings[j][1]

        if ratings[j][2] >= limit_for_love and genre_in_common(movie, second_movie):
            if not G.has_edge(movie, second_movie):
                G.add_edge(movie, second_movie, weight=1)   # first shared fan
            else:
                G[movie][second_movie]['weight'] += 1       # one more shared fan
        j += 1

In [None]:
# x: thresholds 0..50.
# y: for each threshold, the number of movies whose average rating is at least
#    limit_for_love and that have more ratings than the threshold.
x = list(range(51))
y = [
    sum(
        1
        for node in G.nodes
        if G.nodes[node]['avg_rating'] >= limit_for_love
        and G.nodes[node]['rating_count'] > threshold
    )
    for threshold in x
]

plt.plot(x, y)

plt.xlabel('minimum ratings')
plt.ylabel('count of loved movies')
plt.title('How many ratings does a loved movie have?')

plt.show()

In [None]:
# Keep only the largest connected component of G.
# G is undirected, so connected_components applies directly and no detour
# through a directed copy is needed. `key=len` is essential: without it `max`
# compares the node sets with the subset relation, and for disjoint components
# that simply returns the first component rather than the biggest one.
largest_component = max(nx.connected_components(G), key=len)

# .copy() turns the subgraph view into an independent graph.
Gsub = G.subgraph(largest_component).copy()

In [None]:
red = "#A83432"
blue = "#323EA8"

# One colour per node of Gsub, in iteration order: red for movies with an
# average rating of at least limit_for_love and more than 20 ratings, blue
# for all others.
defined_colors = [
    red
    if G.nodes[node]['avg_rating'] >= limit_for_love and G.nodes[node]['rating_count'] > 20
    else blue
    for node in Gsub
]

# Node positions from a ForceAtlas2 layout (100 iterations, gravity 1.0).
forceatlas2 = ForceAtlas2(gravity=1.0)
positions = forceatlas2.forceatlas2_networkx_layout(Gsub, pos=None, iterations=100)

In [None]:
# Draw Gsub on one very large canvas: nodes in their assigned colours, edges
# half transparent, both placed at the precomputed layout positions.
fig = plt.figure(num=1, figsize=(200, 100), dpi=60)
nodes = nx.draw_networkx_nodes(Gsub, pos=positions, node_color=defined_colors, alpha=1)
nx.draw_networkx_edges(Gsub, pos=positions, alpha=0.5)

# PART B TF-IDF

For each given category, the wikitexts of the corresponding movies are combined into one document.


In [12]:
import os 
import string 

def get_plot(file):
    """Cut the plot section out of a movie's wikitext.

    Only the plot of the movie is analysed, so the text is narrowed down:

    * if "==Plot==" is followed somewhere by "==Cast==", the text from the
      plot heading up to (not including) that cast heading is returned;
    * otherwise, if "References" occurs, the text from character 1000 up to
      the first "References" is returned;
    * otherwise the text is returned unchanged.

    Parameters
    ----------
    file : str
        The wikitext of one movie.

    Returns
    -------
    str
        The selected part of `file`.
    """
    plot_start = file.find("==Plot==")

    # Look for the cast heading only after the plot heading, so a cast heading
    # that comes first cannot produce an empty slice.
    cast_start = file.find("==Cast==", plot_start) if plot_start != -1 else -1

    if plot_start != -1 and cast_start != -1:
        return file[plot_start:cast_start]

    references_start = file.find("References")
    if references_start != -1:
        # The first 1000 characters are skipped; presumably they hold page
        # header/infobox markup rather than prose. TODO confirm.
        return file[1000:references_start]

    return file
    
                    
def cleaning_text(raw):
    """Tokenise `raw` and return its word-like tokens in lower case.

    The text is split with nltk's wordpunct tokenizer. Tokens without any
    ASCII letter or digit (pure punctuation) are dropped, and the remaining
    tokens are lower-cased. Returns a list of strings.
    """
    # a token qualifies as soon as it contains one letter or digit
    has_letter_or_digit = re.compile('.*[A-Za-z0-9].*')

    return [
        token.lower()
        for token in nltk.wordpunct_tokenize(raw)
        if has_letter_or_digit.match(token)
    ]
    
def remove_unwanted(words):
    """Drop unwanted words and return the rest as one cleaned string.

    Removed are the English nltk stop words, month names, and a hand-made
    list of wiki-markup and citation leftovers. The remaining words are
    joined with single spaces. After that, punctuation characters and digits
    are deleted and runs of spaces are collapsed to one space.

    Parameters
    ----------
    words : iterable of str
        Tokens to filter. The comparison is case-sensitive, so tokens are
        expected in lower case.

    Returns
    -------
    str
        The cleaned, space-separated text.
    """
    stops = stopwords.words('english')
    months = ['january','february','march','april','may','june','july','august','september','october','november','december']
    Extra_stops = ['plot', 'nin', 'ref','url','www','http','https','com','jsonline','df','n','cite','web','html','mdy','name','article','title','date','webcitation','access']
    Extra = ['webb','film','nmark','critic','critics','publisher','director','editor','deschanel','awards','aa','list','plainlist']

    # a set makes every membership test constant-time
    unwanted = set(stops + Extra_stops + months + Extra)

    final = " ".join(word for word in words if word not in unwanted)
    final = final.translate(str.maketrans("", "", string.punctuation))
    final = "".join(ch for ch in final if not ch.isdigit())  # removing year and such

    # deleting characters can leave several spaces in a row
    return re.sub(r" {2,}", " ", final)

     


In [41]:
# Combined stop-word list: English nltk stop words, wiki-markup/citation
# leftovers, and all twelve month names.
stops = stopwords.words('english')
months = ['january','february','march','april','may','june','july','august','september','october','november','december']
Extra_stops = ['plot', 'nin', 'ref','url','www','http','https','com','jsonline','df','n','cite','web','html','mdy','name','article','title','date','webcitation','access']
stops = stops + Extra_stops + months

 


In [148]:
import os

# Documents has one slot per group of files; Document collects every cleaned
# text in reading order.
Documents = [None]*3
Document = []

# iterate over the files in the "Wikitexts" folder of the working directory
Wikitext = os.listdir(os.getcwd() + '\\Wikitexts')
count = 0
for text in Wikitext:
    if count > 13:
        break

    # Only the files at positions 0..12 are read; position 13 is passed over.
    if count <= 12:
        with open(os.getcwd() + '\\Wikitexts\\' + text) as f:
            file = f.read()
            file = get_plot(file)
            file_word = remove_unwanted(cleaning_text(file))

            # Positions 5..8 overwrite slot 1 and positions 9..12 overwrite
            # slot 2. Positions 0..4 only go into Document, so slot 0 keeps
            # its initial None.
            if count > 8:
                Documents[2] = file_word
            elif count > 4:
                Documents[1] = file_word
            Document.append(file_word)

    count = count+1
        

In [13]:
import os

# Number of wikitext files turned into documents (one document per file).
n_documents = 13

# One entry per document; entries stay '' when the folder has fewer files.
Documents = [''] * n_documents
Document = []

# Read from the "Wikitexts" folder of the working directory. os.path.join
# builds a path that is valid on every operating system, and sorting makes the
# choice of files reproducible, because os.listdir returns the names in
# arbitrary order.
wikitext_dir = os.path.join(os.getcwd(), 'Wikitexts')
Wikitext = sorted(os.listdir(wikitext_dir))

for count, text in enumerate(Wikitext[:n_documents]):
    with open(os.path.join(wikitext_dir, text)) as f:
        file = f.read()

    # keep only the plot part, then tokenise and strip unwanted words
    file = get_plot(file)
    Documents[count] = remove_unwanted(cleaning_text(file))

In [21]:
# Smoke test of the cleaning pipeline on a hand-written sentence: print the
# raw sentence first, then the result of tokenising it and removing the
# unwanted words.
h = 'Hello dark and brown wwwcourse the I lived 2065. you live in words legged-skin and soft www'

print(h)
print(remove_unwanted(cleaning_text(h)))


Hello dark and brownwwwcourse the I lived 2065. you live in words legged-skin and soft www
hello dark brownwwwcourse lived  live words legged skin soft


In [28]:
from bs4 import BeautifulSoup
import bleach
import requests

# The sample wikitexts are read from the "Wikitexts" folder of the working
# directory (the folder the document-building cells list) instead of an
# absolute path that exists on one machine only.
wikitext_dir = os.path.join(os.getcwd(), 'Wikitexts')

with open(os.path.join(wikitext_dir, 'Zookeeper_(2011).txt')) as f:
    test = f.read()

with open(os.path.join(wikitext_dir, 'Young_Adult_(2011).txt')) as f:
    test2 = f.read()

# Offsets of the plot and cast headings. They get their own names: assigning
# them to `min` and `max` at cell level would replace the built-in functions
# for every cell that runs afterwards.
plot_start = test.find("==Plot==")
cast_start = test.find("==Cast==")

# keep the text between the two headings and clean it
test = test[plot_start:cast_start]
test = remove_unwanted(cleaning_text(test))

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [29]:
# We use function to create the TF-IDF vectors
#calling the TfidfVectorizer

stops= stopwords.words('english')
months = ['january','february','march','april','may','june','july','august','september','october','november']
Extra_stops = ['plot', 'nin', 'ref','url','www','http','https','com','jsonline','df','n','cite','web','html','mdy','name','article','title','date','webcitation','access']
Extra = ['webb','film','nmark','critic','critics','publisher','director','editor','deschanel','awards','aa']
unwanted = stops  + Extra_stops + months + Extra

vectorize= TfidfVectorizer(stop_words=unwanted, ngram_range= (1,1)) 
response= vectorize.fit_transform(Documents)

feuture_names = vectorize.get_feature_names_out()




ValueError: Iterable over raw text documents expected, string object received.

In [None]:
import enchant
d = enchant.Dict("en_US")
from nltk.corpus import words

# Dense copy of the TF-IDF matrix as nested Python lists: one inner list of
# scores per document.
dense = response.todense()
denselist = dense.tolist()

# For every document keep the terms whose TF-IDF score is positive. The
# scores in a row line up with the entries of feuture_names.
all_keywords = [
    [term for term, score in zip(feuture_names, description) if score > 0]
    for description in denselist
]

print (all_keywords)


In [22]:


# Number of clusters to form.
true_k = 8

# random_state fixes the centroid initialisation so reruns give the same
# clusters.
model = KMeans(n_clusters=true_k, init="k-means++", max_iter=100, n_init=1, random_state=0)

# Cluster all documents. Fitting on response[0] would hand KMeans a single
# sample, which is fewer than the number of clusters.
model.fit(response)

# For each cluster: term indices sorted by descending centroid weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorize.get_feature_names_out()

# Write the 15 heaviest terms of every cluster to a text file in the working
# directory (a relative path instead of a machine-specific absolute one).
with open("trial_result.txt", "w", encoding="utf-8") as f:
    for i in range(true_k):
        f.write(f"Cluster {i}\n")
        for ind in order_centroids[i, :15]:
            f.write(' %s\n' % terms[ind])
        f.write("\n\n")


    

KeyboardInterrupt: 