In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances

In [2]:
# Load the IMDB top-1000 CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that exact file exists — consider a configurable data directory.
# NOTE(review): 'ansi' is a Windows-only codec alias in CPython (alias of
# 'mbcs'); on other platforms this raises LookupError. Confirm the file's real
# encoding before porting.
df_perf_all = pd.read_csv(r'C:\Users\zhouhang\Desktop\imdb_top_1000.csv', sep=",", encoding = 'ansi')
# Preview the first rows of the loaded table.
df_perf_all.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [3]:
# Show the (rows, columns) size of the loaded table.
df_perf_all.shape

(1000, 16)

In [4]:
# Step 1: drop rows whose 'Overview' is missing.
df_perf_with_description = df_perf_all.loc[df_perf_all['Overview'].notna()]
# Step 2: drop rows whose 'Overview' consists only of whitespace. This is done
# after step 1 so that .str.isspace() never sees a missing value.
df_perf_with_description = df_perf_with_description.loc[
    ~df_perf_with_description['Overview'].str.isspace()
]

In [5]:
# Titles as a plain array; element i belongs to the i-th row (by position)
# of df_perf_with_description.
name = df_perf_with_description['Series_Title'].values
# Peek at the first six titles.
name[0:6]

array(['The Shawshank Redemption', 'The Godfather', 'The Dark Knight',
       'The Godfather: Part II', '12 Angry Men',
       'The Lord of the Rings: The Return of the King'], dtype=object)

In [6]:
# NOTE(review): `notes` is read from the same 'Series_Title' column as `name`,
# so the two arrays are identical — confirm whether a different column
# was intended here.
notes = df_perf_with_description['Series_Title'].values
# Peek at the first five entries.
notes[0:5]

array(['The Shawshank Redemption', 'The Godfather', 'The Dark Knight',
       'The Godfather: Part II', '12 Angry Men'], dtype=object)

In [7]:
# Overview texts as a plain array, in the same positional order as `name`.
descr = df_perf_with_description['Overview'].values
# Peek at the first three overviews.
descr[0:3]

array(['Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
       "An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son.",
       'When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.'],
      dtype=object)

In [8]:
# Fit a TF-IDF vectorizer (default settings) on the overviews and encode them
# as a sparse matrix with one row per overview.
tfidfv = TfidfVectorizer()
descr_matrix = tfidfv.fit_transform(descr)
# Display the matrix summary (shape, dtype, number of stored elements).
descr_matrix

<1000x5648 sparse matrix of type '<class 'numpy.float64'>'
	with 21677 stored elements in Compressed Sparse Row format>

In [9]:
# Same encoding step as the TF-IDF cell, but with raw term counts
# (CountVectorizer, default settings) instead of TF-IDF weights.
countv = CountVectorizer()
descr_matrix_co = countv.fit_transform(descr)
# Display the matrix summary (shape, dtype, number of stored elements).
descr_matrix_co

<1000x5648 sparse matrix of type '<class 'numpy.int64'>'
	with 21677 stored elements in Compressed Sparse Row format>

In [10]:
class SimpleKNNRecommender:
    """
    Brute-force nearest-neighbour recommender over a precomputed
    object-feature matrix.

    Every query compares one feature row against all stored rows with a
    pairwise metric (cosine similarity, Euclidean or Manhattan distance) and
    returns the best-ranked rows of the metadata table.
    """

    # Tolerances used to recognise the query object itself in the results.
    # Pairwise metrics are computed in floating point, so the self-match may
    # come out as e.g. 0.9999999999999999 (similarity) or ~1e-8 (distance)
    # instead of exactly 1 or 0; an exact comparison would let it through.
    _SIM_TOL = 1e-9    # similarity >= 1 - _SIM_TOL is treated as "same object"
    _DIST_TOL = 1e-6   # distance   <= _DIST_TOL   is treated as "same object"

    def __init__(self, X_matrix, X_names, X_notes, X_descr):
        """
        Input parameters:
        X_matrix - training data (object-feature matrix), one row per object
        X_names - array of object names (stored in column 'Perfume Name')
        X_notes - array of object notes (stored in column 'Notes')
        X_descr - array of object descriptions (stored in column 'Description')

        The three arrays are expected to be in the same positional order as
        the rows of X_matrix.
        """
        # Keep the feature matrix and build the metadata table; 'Dist' is
        # filled in by recommend_for_single_object on every call.
        self._X_matrix = X_matrix
        self.df = pd.DataFrame(
            {'Perfume Name': pd.Series(X_names, dtype='str'),
            'Notes': pd.Series(X_notes, dtype='str'),
            'Description': pd.Series(X_descr, dtype='str'),
            'Dist': pd.Series([], dtype='float')})

    def recommend_for_single_object(self, K: int,
                X_matrix_object, cos_flag = True, manh_flag = False):
        """
        Build recommendations for a single object.

        Input parameters:
        K - number of neighbours to return (passed to DataFrame.head)
        X_matrix_object - the single row of the object-feature matrix that
            describes the query object
        cos_flag - if True, rank by cosine similarity (largest first)
        manh_flag - used only when cos_flag is False: if True, rank by
            Manhattan distance, otherwise by Euclidean distance
            (smallest first in both cases)

        Return value: DataFrame with the K best-ranked rows of self.df.
        The 'Dist' column holds the metric multiplied by 1_000_000. Rows that
        match the query itself (similarity ~1 or distance ~0, within a small
        tolerance) are left out; this also drops exact duplicates of the query.

        Raises ValueError if X_matrix_object does not describe exactly one
        object.

        Side effect: self.df['Dist'] is overwritten on every call.
        """
        scale = 1000000

        # Pick the metric; each call yields one column of values, one per
        # stored object.
        if cos_flag:
            dist = cosine_similarity(self._X_matrix, X_matrix_object)
        elif manh_flag:
            dist = manhattan_distances(self._X_matrix, X_matrix_object)
        else:
            dist = euclidean_distances(self._X_matrix, X_matrix_object)

        if dist.ndim != 2 or dist.shape[1] != 1:
            raise ValueError(
                'X_matrix_object must contain exactly one row, '
                'got a result of shape {}'.format(dist.shape))

        # Flatten (n, 1) -> (n,) so the column assignment is unambiguous.
        self.df['Dist'] = dist.ravel() * scale

        if cos_flag:
            # Larger similarity means closer, so sort descending.
            res = self.df.sort_values(by='Dist', ascending=False)
            # Skip rows with (numerically) unit similarity: that is the
            # query object itself.
            res = res[res['Dist'] < scale * (1.0 - self._SIM_TOL)]
        else:
            # Smaller distance means closer, so sort ascending.
            res = self.df.sort_values(by='Dist', ascending=True)
            # Skip rows with (numerically) zero distance: that is the
            # query object itself.
            res = res[res['Dist'] > scale * self._DIST_TOL]

        # Keep the first K recommendations.
        return res.head(K)

In [12]:
# Despite the name, this is an integer row position (used to index `name`
# and the feature matrices), not a title.
test_perfume_name = 100
# Title stored at that position.
name[test_perfume_name]

'Bacheha-Ye aseman'

In [13]:
# TF-IDF feature row of the query object (a 1-row sparse matrix).
test_perfume_matrix = descr_matrix[test_perfume_name]
test_perfume_matrix

<1x5648 sparse matrix of type '<class 'numpy.float64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [14]:
# Recommender backed by the TF-IDF matrix.
skr1 = SimpleKNNRecommender(descr_matrix, name, notes, descr)

In [15]:
# Full record of the query object. .iloc is positional, which matches how
# `name` and the feature matrices are indexed.
test = df_perf_with_description.iloc[test_perfume_name]
test

Poster_Link      https://m.media-amazon.com/images/M/MV5BZTYwZW...
Series_Title                                     Bacheha-Ye aseman
Released_Year                                                 1997
Certificate                                                     PG
Runtime                                                     89 min
Genre                                         Drama, Family, Sport
IMDB_Rating                                                    8.3
Overview         After a boy loses his sister's pair of shoes, ...
Meta_score                                                    77.0
Director                                              Majid Majidi
Star1                                           Mohammad Amir Naji
Star2                                       Amir Farrokh Hashemian
Star3                                               Bahare Seddiqi
Star4                                       Nafise Jafar-Mohammadi
No_of_Votes                                                  6

In [16]:
# Ask for 15 neighbours of the query row; with the default cos_flag=True the
# ranking uses cosine similarity (largest 'Dist' first).
rec1 = skr1.recommend_for_single_object(15, test_perfume_matrix)
rec1

Unnamed: 0,Perfume Name,Notes,Description,Dist
598,Moonrise Kingdom,Moonrise Kingdom,A pair of young lovers flee their New England ...,251966.634008
121,Ikiru,Ikiru,A bureaucrat tries to find a meaning in his li...,233591.757218
973,Home Alone,Home Alone,An eight-year-old troublemaker must protect hi...,184947.952067
878,Searching,Searching,"After his teenage daughter goes missing, a des...",181669.275204
437,Anatomy of a Murder,Anatomy of a Murder,"In a murder trial, the defendant says he suffe...",168386.310442
207,PK,PK,An alien on Earth loses the only device he can...,163985.222208
329,The Martian,The Martian,An astronaut becomes stranded on Mars after hi...,161850.93867
942,The Butterfly Effect,The Butterfly Effect,Evan Treborn suffers blackouts during signific...,151645.81681
521,Trois couleurs: Bleu,Trois couleurs: Bleu,A woman struggles to find a way to live her li...,148541.441886
6,Pulp Fiction,Pulp Fiction,"The lives of two mob hitmen, a boxer, a gangst...",148225.423635


In [17]:
# Term-count feature row of the same query object (a 1-row sparse matrix).
test_perfume_matrix_co = descr_matrix_co[test_perfume_name]
test_perfume_matrix_co

<1x5648 sparse matrix of type '<class 'numpy.int64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [18]:
# Second recommender, backed by the term-count matrix instead of TF-IDF.
skr2 = SimpleKNNRecommender(descr_matrix_co, name, notes, descr)

In [19]:
# cos_flag=False with the default manh_flag=False selects Euclidean distance,
# so the smallest 'Dist' values come first.
rec2 = skr2.recommend_for_single_object(15, test_perfume_matrix_co, cos_flag = False)
rec2

Unnamed: 0,Perfume Name,Notes,Description,Dist
121,Ikiru,Ikiru,A bureaucrat tries to find a meaning in his li...,5916080.0
598,Moonrise Kingdom,Moonrise Kingdom,A pair of young lovers flee their New England ...,6480741.0
878,Searching,Searching,"After his teenage daughter goes missing, a des...",6557439.0
381,Vizontele,Vizontele,Lives of residents in a small Anatolian villag...,6633250.0
187,Dial M for Murder,Dial M for Murder,A former tennis player tries to arrange his wi...,6633250.0
619,Auf der anderen Seite,Auf der anderen Seite,A Turkish man travels to Istanbul to find the ...,6633250.0
674,Dip huet seung hung,Dip huet seung hung,A disillusioned assassin accepts one last hit ...,6633250.0
876,The Invisible Man,The Invisible Man,"A scientist finds a way of becoming invisible,...",6633250.0
360,The Pursuit of Happyness,The Pursuit of Happyness,A struggling salesman takes custody of his son...,6633250.0
571,Sound of Metal,Sound of Metal,A heavy-metal drummer's life is thrown into fr...,6708204.0
