In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import math
import nltk
import itertools
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from functools import reduce

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Berkay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Berkay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
df = pd.read_csv("imbd.csv", header=None)

In [4]:
col_names = ["Movie Name" ,"Movie Category","Movie General Rate","Year" ,"Score" ,"Age Limit" ,"User Name" ,"Comment","den"]

In [5]:
df.columns = col_names

In [6]:
df = df.rename(columns = {"Comment" : "Comment Title" , "den" : "Comment"})

In [7]:
df = df.drop( columns=["Movie General Rate" , "Movie Category" , "Year" , "Score" , "Age Limit", "Comment Title"] ,axis=1)
df.head()

Unnamed: 0,Movie Name,User Name,Comment
0,Esaretin Bedeli,Sleepin_Dragon,It is no wonder that the film has such a high ...
1,Esaretin Bedeli,EyeDunno,I'm trying to save you money; this is the last...
2,Esaretin Bedeli,alexkolokotronis,This movie is not your ordinary Hollywood flic...
3,Esaretin Bedeli,auuwws,The best movie in history and the best ending ...
4,Esaretin Bedeli,Coxer99,One of the finest films made in recent years. ...


In [8]:
df = df.drop("User Name" , axis=1)
df.head()

Unnamed: 0,Movie Name,Comment
0,Esaretin Bedeli,It is no wonder that the film has such a high ...
1,Esaretin Bedeli,I'm trying to save you money; this is the last...
2,Esaretin Bedeli,This movie is not your ordinary Hollywood flic...
3,Esaretin Bedeli,The best movie in history and the best ending ...
4,Esaretin Bedeli,One of the finest films made in recent years. ...


In [9]:
#STOP WORDS 
stop_words = set(stopwords.words('english'))

In [10]:

#BU KODDA TOKENİZE İŞLEMİ, STOP-WORDS ÇIKARTMA VE NUMERIC DEĞERLERİN ÇIKARTILMA İŞLEMİ YAPILMIŞTIR

def preprocess_text(text):
    word_tokens = word_tokenize(text.lower())  
    filtered_text = [word for word in word_tokens if word.isalnum() and word not in stop_words]  
    return filtered_text

In [11]:
# YORUMLARIN TOKENİZE EDİLMİŞ HALİNİ GÖSTERİR
df['Tokenized Comment'] = df['Comment'].apply(preprocess_text)

print(df[['Movie Name', 'Tokenized Comment']].head())


        Movie Name                                  Tokenized Comment
0  Esaretin Bedeli  [wonder, film, high, rating, quite, literally,...
1  Esaretin Bedeli  [trying, save, money, last, film, title, consi...
2  Esaretin Bedeli  [movie, ordinary, hollywood, flick, great, dee...
3  Esaretin Bedeli  [best, movie, history, best, ending, entertain...
4  Esaretin Bedeli  [one, finest, films, made, recent, years, poig...


In [12]:
#FİLMLER ARASINDAKİ ORTAK KELİMELERİ YAZDIRIR

tokenized_comments_per_movie = df.groupby('Movie Name')['Tokenized Comment'].apply(lambda x: reduce(lambda a, b: a + b, x)).to_dict()

common_words = set(tokenized_comments_per_movie[next(iter(tokenized_comments_per_movie))])
for movie_name, tokenized_comments in tokenized_comments_per_movie.items():
    common_words = common_words.intersection(set(tokenized_comments))

print("Tüm filmler arasındaki ortak kelimeler:")
for word in common_words:
    print(word)


Tüm filmler arasındaki ortak kelimeler:
make
like
made
one
long
would
times
still
even
say
first
much
life
get
never
well
also
story
ever
time
could
think
people
every
film
better
best
everything
two
good
films
great
movie
movies
character
way
something
really
watch
though
see
characters
many
makes


In [13]:
#ORTAK KELİMELERİ İÇEREN DATAFRAME OLUŞTURUR

common_words_list = list(common_words)

common_words_df = pd.DataFrame(common_words_list, columns=['Common Words'])

print(common_words_df.head())


  Common Words
0         make
1         like
2         made
3          one
4         long


In [14]:
#TF-IDF'LERİN MATRİSİNİ OLUŞTURUR VE DATA FRAME'E ATAR

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(df['Comment'])
#print(tfidf_matrix)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())



#ORTAK KELİMELERİN TD-İDF DEĞERLERİNİ YAZDIRIR
common_words_tfidf = {}
for word in common_words:
    if word in tfidf_df.columns:
        tfidf_value = tfidf_df[word].mean()  
        common_words_tfidf[word] = tfidf_value

common_words_tfidf_df = pd.DataFrame.from_dict(common_words_tfidf, orient='index', columns=['TF-IDF'])

#print(common_words_tfidf_df)

In [15]:
# TF-IDF DEĞERLERİNİ AZALAN SIRAYA GÖRE SIRALAR
common_words_tfidf_df = common_words_tfidf_df.sort_values(by='TF-IDF', ascending=False)

print(common_words_tfidf_df)


              TF-IDF
film        0.048534
movie       0.043447
one         0.029122
best        0.022489
like        0.019597
great       0.019256
story       0.018800
time        0.018708
good        0.016572
well        0.015739
see         0.015358
ever        0.015029
also        0.014595
really      0.014321
even        0.014246
films       0.013849
first       0.013717
movies      0.013666
much        0.013466
many        0.013158
character   0.013008
characters  0.012966
watch       0.012872
made        0.012716
would       0.012591
way         0.012433
life        0.012146
people      0.011995
think       0.011693
never       0.011280
still       0.010967
could       0.010634
get         0.010474
every       0.010385
make        0.009910
two         0.009452
say         0.009124
long        0.008698
something   0.008689
better      0.008659
makes       0.008601
times       0.008058
everything  0.007579
though      0.007180


In [16]:
# KOMBİNASYON OLUŞTURURUR
all_combinations = []

for r in range(1, 5): 
    combinations = itertools.combinations(common_words_tfidf_df.index, r)
    all_combinations.extend(combinations)

#for combination in all_combinations:
    #print(combination)


In [17]:
#KOMBİNASYONLARIN TF-IDF DEĞERLERİNİ HESAPLAR
combination_tfidf = {}

combination_count = 0

for combination in all_combinations:
    tfidf_sum = sum(common_words_tfidf_df.loc[word, 'TF-IDF'] for word in combination)
    tfidf_avg = tfidf_sum / len(combination)
    combination_tfidf[combination] = tfidf_avg
    combination_count += 1

#for combination, tfidf in combination_tfidf.items():
    #print(f"Kombinasyon: {combination}, TF-IDF Değeri: {tfidf}")

print(f"Toplam kombinasyon sayısı: {combination_count}")


Toplam kombinasyon sayısı: 149985


In [18]:
# DEĞERLERİ DATA FRAME E ATAR
combination_tfidf_df = pd.DataFrame(list(combination_tfidf.items()), columns=['Combination', 'TF-IDF'])

print(combination_tfidf_df.head())


  Combination    TF-IDF
0     (film,)  0.048534
1    (movie,)  0.043447
2      (one,)  0.029122
3     (best,)  0.022489
4     (like,)  0.019597


In [19]:
#BÜTÜN KOMBİNASYONLARI AZALAN SIRAYA GÖRE YAZDIRIR
sorted_combinations = sorted(combination_tfidf.items(), key=lambda x: x[1], reverse=True)

#for combination, tfidf in sorted_combinations:
    #print(f"Kombinasyon: {combination}, TF-IDF Değeri: {tfidf}")

print(f"Toplam kombinasyon sayısı: {combination_count}")


Toplam kombinasyon sayısı: 149985


In [20]:
# TF-IDF DEĞERLERİNE GÖRE KOMBİNASYONLARDAN 0.03 OLANLARI "sorted_combinations" GÖRE YAZDIRIR:
filtered_combinations = [(combination, tfidf) for combination, tfidf in sorted_combinations if tfidf >= 0.03]

#for combination, tfidf in filtered_combinations:
    #print(f"Kombinasyon: {combination}, TF-IDF Değeri: {tfidf}")

print(f"Filtrelenmiş kombinasyon sayısı: {len(filtered_combinations)}")


Filtrelenmiş kombinasyon sayısı: 382


In [21]:
#FİLTRELENMİŞ KOMBİNASYONLAR DATAFRAME'E AKTARILIR:
import pandas as pd
filtered_combinations_df = pd.DataFrame(filtered_combinations, columns=['Combination', 'TF-IDF'])
print(filtered_combinations_df)


                        Combination    TF-IDF
0                           (film,)  0.048534
1                     (film, movie)  0.045990
2                          (movie,)  0.043447
3                (film, movie, one)  0.040368
4                       (film, one)  0.038828
..                              ...       ...
377        (film, movie, see, made)  0.030014
378       (film, movie, also, much)  0.030010
379                   (movie, good)  0.030009
380    (film, movie, really, first)  0.030005
381  (film, movie, ever, character)  0.030005

[382 rows x 2 columns]


In [22]:
# FİLTRELENMİŞ KOMBİNASYONLARIN SÜTUNLARININ İŞLEM KOLAYLIĞI AÇISINDAN YER DEĞİŞTİRİLMESİ İŞLEMİ:
filtered_combinations_df = filtered_combinations_df[['TF-IDF', 'Combination']]
print(filtered_combinations_df)


       TF-IDF                     Combination
0    0.048534                         (film,)
1    0.045990                   (film, movie)
2    0.043447                        (movie,)
3    0.040368              (film, movie, one)
4    0.038828                     (film, one)
..        ...                             ...
377  0.030014        (film, movie, see, made)
378  0.030010       (film, movie, also, much)
379  0.030009                   (movie, good)
380  0.030005    (film, movie, really, first)
381  0.030005  (film, movie, ever, character)

[382 rows x 2 columns]


In [23]:
filtered_combinations_df.to_csv('tfidf.txt', sep=' ', index=False)
filtered_combinations_df.to_csv('tf-idf_sorted.csv', index=False)
filtered_combinations_df.to_csv('tf-idf_sorted.txt', index=False)