In [427]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, fbeta_score #To evaluate our model
from sklearn.model_selection import GridSearchCV

from sklearn.utils import resample
from sklearn.metrics import roc_curve, roc_auc_score

In [428]:

# Fetch the NLTK resources requested by this notebook (tokenizer data, the English
# POS tagger, WordNet with its omw-1.4 companion, and the stopword lists).
# The last call stays last on purpose: its return value is what the cell displays.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fatba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\fatba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\fatba\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fatba\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\fatba\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fatba\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-d

True

In [429]:
# Load the movie dataset; the preview in the next cell shows Title, Genre and Description columns.
df = pd.read_csv('AA_movie_train_data.csv')

In [430]:
df.head()

Unnamed: 0,Title,Genre,Description
0,Windup (2006),action,Windup is a very original comedy. We follow B...
1,Hitman (2014),action,"Rana and Shuvo, two siblings, are very differ..."
2,Taken by Force (2010),action,When a San Francisco detective goes hunting f...
3,Bui doi Cho Lon (2013),action,"A man name Lam picks up his girlfriend, but i..."
4,Siam Yuth: The Dawn of the Kingdom (2015),action,Thap and Sin are a homeless musician brothers...


In [431]:
df['Genre'].value_counts()

Genre
action         1000
comedy         1000
documentary    1000
drama          1000
thriller       1000
Name: count, dtype: int64

In [432]:
df['Description'][0]

' Windup is a very original comedy. We follow Bernie Shaddick from his tragic childhood, through to his present day pursuit of a career as an inventor. Likened to such films as "Raising Arizona" and "The Big Lebowski", gritty cohen-esque humor saturates Bernie Shaddick\'s life, from his encounter with mace at a potential sale, to his murderous mentors posing as his Aunt Audre and Uncle Reg. Bernie is a man who, in the face of rejection and ridicule, just "doesn\'t get it", and perseveres anyways. His good nature gets on the nerves of his cruel co-workers at his commercial real estate day-job. Upholding an almost delusional enthusiasm, Bernie unknowingly sells his soul with a smile! Bernie\'s girlfriend Latrice and roommate Stuart only accentuate Bernie\'s seemingly pathetic existence. Latrice, sexy in a dirty sort of way, loves Bernie, although she would prefer to love his money, if he had any. Stuart, usually clad in underpants and obsessed with the rubix-cube, lives with Bernie becau

In [433]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk
from nltk.corpus import wordnet

In [434]:
import nltk
from nltk.corpus import stopwords as nltk_stopwords

def get_stop_words(stop_file_path, include_spanish=True):
    """Build the stopword set used to filter lemmas.

    Args:
        stop_file_path: Path to a UTF-8 text file holding one stopword per line.
            Blank lines are skipped; entries are stripped and lower-cased.
        include_spanish: When True, NLTK's Spanish stopword list is merged in.

    Returns:
        frozenset[str]: Every collected stopword plus its accent-stripped form.

    Raises:
        OSError: If ``stop_file_path`` cannot be opened.
    """
    import unicodedata  # local import keeps this cell runnable on its own

    def _strip_accents(word):
        # Decompose accented letters and drop the combining marks ("también" -> "tambien").
        decomposed = unicodedata.normalize('NFKD', word)
        return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    with open(stop_file_path, 'r', encoding='utf-8') as f:
        custom_stopwords = {line.strip().lower() for line in f if line.strip()}

    if include_spanish:
        custom_stopwords.update(nltk_stopwords.words('spanish'))

    # clean_text() transliterates every description to ASCII before tokenising, so an
    # accented stopword can never equal a token. Register the accent-free variant too.
    # The list is materialised first because a set must not grow while being iterated.
    ascii_variants = [_strip_accents(word) for word in custom_stopwords]
    custom_stopwords.update(variant for variant in ascii_variants if variant)

    return frozenset(custom_stopwords)

In [435]:
from unidecode import unidecode

def clean_text(text):
    """Normalise a raw description into lower-case, space-separated ASCII words.

    Steps, in order: transliterate to ASCII with ``unidecode``, lower-case,
    remove URLs, remove e-mail-like tokens, replace HTML/XML tags with a space,
    collapse every run of digits / non-word characters / underscores into one
    space, then squeeze whitespace and trim.

    Args:
        text: Raw text. ``None`` and float NaN (how pandas represents a missing
            cell) are treated as an empty document.

    Returns:
        str: The cleaned text; may be empty.
    """
    # A missing description would otherwise fail inside unidecode() and abort a
    # whole DataFrame.apply(); an empty string lets the pipeline carry on.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = unidecode(text)                     # strip accents
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)   # URLs
    text = re.sub(r'\S+@\S+', '', text)                   # e-mail-like tokens
    text = re.sub(r"</?.*?>", " ", text)                  # markup tags
    text = re.sub(r"(\d|\W|_)+", " ", text)               # digits, punctuation, underscores
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [436]:
from nltk.tokenize import word_tokenize

def tokenize_text(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

In [437]:
def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Tags starting with J, V or R map to adjective, verb and adverb respectively.
    Every other tag (including those starting with N) maps to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wn_pos
    # Nouns and anything unrecognised share the same fallback.
    return wordnet.NOUN


In [438]:
from nltk.stem import WordNetLemmatizer

def lemmatize_with_pos(pos_tagged_tokens):
    """Lemmatise ``(word, treebank_tag)`` pairs and return the lemmas in input order.

    Each tag is converted with ``get_wordnet_pos`` before being handed to
    ``WordNetLemmatizer.lemmatize``.
    """
    wn_lemmatizer = WordNetLemmatizer()
    return [
        wn_lemmatizer.lemmatize(token, get_wordnet_pos(treebank_tag))
        for token, treebank_tag in pos_tagged_tokens
    ]


In [439]:
# Stopword set for the pipeline: entries of stopwords.txt plus NLTK's Spanish list.
stopwords = get_stop_words("stopwords.txt", include_spanish=True)

# 🔹 3. Full processing pipeline function
def full_text_pipeline(text, stopwords):
    """Clean, tokenise, POS-tag and lemmatise ``text``, then drop stopwords.

    Returns:
        tuple: ``(kept_lemmas, joined)`` where ``kept_lemmas`` is the list of
        lemmas not found in ``stopwords`` and ``joined`` is that list joined
        with single spaces.
    """
    tagged_tokens = pos_tag(tokenize_text(clean_text(text)))
    kept_lemmas = []
    for lemma in lemmatize_with_pos(tagged_tokens):
        if lemma in stopwords:
            continue
        kept_lemmas.append(lemma)
    return kept_lemmas, ' '.join(kept_lemmas)

# 🔹 4. Apply to DataFrame
# Run the pipeline once per description and collect the (tokens, string) tuples.
# Wrapping every result in a pd.Series inside .apply() builds one Series object
# per row, which is far slower than collecting plain tuples and assigning columns.
processed = [full_text_pipeline(text, stopwords) for text in df['Description']]

df['description_tokens'] = pd.Series(
    [tokens for tokens, _ in processed], index=df.index, dtype=object
)
df['description_string'] = pd.Series(
    [joined for _, joined in processed], index=df.index, dtype=object
)

# NOTE: despite its name, description_tokens holds the token COUNT of each document.
description_tokens = df['description_tokens'].apply(len)
description_string = df['description_string']
df[['description_string']].to_csv('processed_descriptions.csv', index=False)

In [440]:
# description_tokens holds the number of kept tokens per document (len of each
# token list), so this is the mean token count per description.
average_tokens = description_tokens.mean()
average_tokens

68.1758

In [441]:
description_string[0]

'windup original comedy follow bernie shaddick tragic childhood day pursuit career inventor liken film raise arizona lebowski gritty cohen esque humor saturate bernie shaddick life encounter mace potential sale murderous mentor pose aunt audre uncle reg bernie rejection ridicule doesn persevere anyways nature nerve cruel co worker commercial real estate day job uphold delusional enthusiasm bernie unknowingly sell soul smile bernie girlfriend latrice roommate stuart accentuate bernie seemingly pathetic existence latrice sexy dirty sort love bernie prefer love money stuart usually clothe underpants obsess rubix cube live bernie allow pursue beer welfare check latrice myriad flashback eccentric aunt uncle learn mentor reg audre life strange habit murder meal wheel delivery boys worry bernie won inherit twisted trait aunt audre uncle reg actually squatter murder pose real family bernie dream invent product windup flashlight conjure laughter succeed conscience boy bernie hold flashlight gra

In [442]:
description_string.shape

(5000,)

In [443]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts over the cleaned descriptions.
# max_df=0.16 drops terms present in more than 16% of the documents;
# max_features=5000 then keeps only the 5000 most frequent remaining terms.
bow_vectorizer = CountVectorizer(max_df=0.16, max_features=5000)
description_bow_matrix = bow_vectorizer.fit_transform(description_string.tolist())

In [444]:
# Report how the vocabulary size reacts to the max_df cut-off.
max_df_candidates = (0.15, 0.2, 0.3, 0.5, 0.7, 0.9)
for max_df_val in max_df_candidates:
    cv = CountVectorizer(max_df=max_df_val).fit(description_string)
    print(f"max_df={max_df_val}: {len(cv.vocabulary_)} unique words")

max_df=0.15: 33540 unique words
max_df=0.2: 33549 unique words
max_df=0.3: 33551 unique words
max_df=0.5: 33552 unique words
max_df=0.7: 33552 unique words
max_df=0.9: 33552 unique words


In [445]:
# Corpus-wide count of every vocabulary term, listed from most to least frequent.
sum_words = description_bow_matrix.sum(axis=0)
words_freq = sorted(
    [(term, sum_words[0, column]) for term, column in bow_vectorizer.vocabulary_.items()],
    key=lambda term_count: term_count[1],
    reverse=True,
)

In [446]:
print('The highest frequency words:')
words_freq[0:20] 

The highest frequency words:


[('woman', 1110),
 ('people', 962),
 ('kill', 945),
 ('try', 903),
 ('own', 900),
 ('father', 886),
 ('girl', 879),
 ('leave', 847),
 ('meet', 824),
 ('help', 798),
 ('home', 762),
 ('mother', 755),
 ('wife', 741),
 ('begin', 713),
 ('start', 696),
 ('death', 673),
 ('brother', 670),
 ('police', 663),
 ('city', 659),
 ('lead', 655)]

In [447]:
print('The lowest frequency words:')
words_freq[-20:] 

The lowest frequency words:


[('translate', 10),
 ('thinker', 10),
 ('suppress', 10),
 ('tango', 10),
 ('rosario', 10),
 ('definition', 10),
 ('epidemic', 10),
 ('mississippi', 10),
 ('uber', 10),
 ('loop', 10),
 ('ruling', 10),
 ('depiction', 10),
 ('mohawk', 10),
 ('fatherhood', 10),
 ('manor', 10),
 ('concentration', 10),
 ('mick', 10),
 ('tibet', 10),
 ('toi', 10),
 ('agatha', 10)]

In [448]:
bow_vectorizer.get_feature_names_out()

array(['aaron', 'aarti', 'abandon', ..., 'zombie', 'zone', 'zorawar'],
      dtype=object)

In [449]:
# Vocabulary terms as an array, so they can be fancy-indexed by column number later.
feature_names = np.array(bow_vectorizer.get_feature_names_out())
feature_names.shape[0]

5000

In [450]:
pd.DataFrame(description_bow_matrix.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [451]:
# Dense bag-of-words table (one column per vocabulary term), exported for later use.
bow_columns = bow_vectorizer.get_feature_names_out()
df_bow = pd.DataFrame(description_bow_matrix.toarray(), columns=bow_columns)
df_bow.to_csv('asg1_bow_export.csv', index=False)

### 1.4 TF-IDF

In [452]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw bag-of-words counts by inverse document frequency.
# Both keyword arguments are spelled out explicitly (idf weighting on, smoothed idf).
tfidf_transformer=TfidfTransformer(smooth_idf=True, use_idf=True)
description_tfidf = tfidf_transformer.fit_transform(description_bow_matrix)

In [453]:
tfidf_transformer.idf_

array([7.1194979 , 7.7256337 , 4.68111126, ..., 5.56614945, 6.29851735,
       8.82424599])

In [454]:
tfidf_transformer.idf_.shape

(5000,)

In [455]:
# Column indices ordered by ascending idf; the first 100 are the most widespread terms.
sorted_by_idf = np.argsort(tfidf_transformer.idf_)
lowest_idf_terms = feature_names[sorted_by_idf[:100]]
print("Features with lowest idf:\n{}".format(lowest_idf_terms))

Features with lowest idf:
['own' 'woman' 'try' 'people' 'meet' 'leave' 'help' 'kill' 'begin' 'start'
 'home' 'father' 'decide' 'lead' 'girl' 'follow' 'force' 'look' 'wife'
 'death' 'set' 'name' 'tell' 'bring' 'mother' 'city' 'soon' 'run' 'fall'
 'call' 'fight' 'child' 'daughter' 'police' 'return' 'change' 'night'
 'brother' 'discover' 'town' 'murder' 'lose' 'local' 'house' 'past'
 'documentary' 'secret' 'learn' 'plan' 'play' 'school' 'move' 'dream'
 'job' 'journey' 'save' 'series' 'money' 'happen' 'struggle'
 'relationship' 'movie' 'include' 'real' 'country' 'escape' 'war' 'hand'
 'head' 'die' 'win' 'character' 'event' 'american' 'attempt' 'realize'
 'break' 'sister' 'boy' 'husband' 'dead' 'student' 'stop' 'bad' 'power'
 'human' 'age' 'team' 'true' 'little' 'question' 'experience' 'former'
 'create' 'search' 'evil' 'believe' 'drug' 'beautiful' 'hold']


In [456]:
pd.DataFrame(description_tfidf.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [457]:
# For every term take its largest tf-idf weight across all documents,
# then order the term indices by that maximum (ascending).
max_value = description_tfidf.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()

weakest_terms = feature_names[sorted_by_tfidf[:20]]
strongest_terms = feature_names[sorted_by_tfidf[-20:]]

print("Features with lowest tfidf:\n{}".format(weakest_terms))

print("Features with highest tfidf: \n{}".format(strongest_terms))

Features with lowest tfidf:
['praise' 'pound' 'tease' 'dire' 'endanger' 'obsessive' 'conceal' 'injure'
 'kapoor' 'operate' 'seriously' 'thirsty' 'dc' 'actually' 'flees'
 'resemble' 'enthusiasm' 'flip' 'seventeen' 'verma']
Features with highest tfidf: 
['jo' 'shanichari' 'kishan' 'coral' 'jaggu' 'yoga' 'jaswant' 'devin' 'di'
 'perry' 'volcano' 'pierre' 'sreeram' 'rider' 'selma' 'laundry' 'wine'
 'juan' 'comme' 'mar']


## 2. Text Data Understanding

### 2.1 Extract keywords using TF-IDF matrix

In [458]:
idx=0
doc = description_string[idx]
print(doc)

tf_idf_vector=description_tfidf[idx]
print(tf_idf_vector)

windup original comedy follow bernie shaddick tragic childhood day pursuit career inventor liken film raise arizona lebowski gritty cohen esque humor saturate bernie shaddick life encounter mace potential sale murderous mentor pose aunt audre uncle reg bernie rejection ridicule doesn persevere anyways nature nerve cruel co worker commercial real estate day job uphold delusional enthusiasm bernie unknowingly sell soul smile bernie girlfriend latrice roommate stuart accentuate bernie seemingly pathetic existence latrice sexy dirty sort love bernie prefer love money stuart usually clothe underpants obsess rubix cube live bernie allow pursue beer welfare check latrice myriad flashback eccentric aunt uncle learn mentor reg audre life strange habit murder meal wheel delivery boys worry bernie won inherit twisted trait aunt audre uncle reg actually squatter murder pose real family bernie dream invent product windup flashlight conjure laughter succeed conscience boy bernie hold flashlight gran

In [459]:
tf_idf_vector.shape

(1, 5000)

In [460]:
# Non-zero entries of the selected TF-IDF row as (column index, weight) pairs.
row_coo = tf_idf_vector.tocoo()
temp = pd.DataFrame(
    zip(row_coo.col, row_coo.data),
    columns=['feature_number', 'tf_idf'],
)
temp

Unnamed: 0,feature_number,tf_idf
0,58,0.032735
1,146,0.033600
2,252,0.047520
3,322,0.172975
4,401,0.042489
...,...,...
95,4747,0.041597
96,4889,0.098373
97,4897,0.044398
98,4949,0.065587


In [461]:
# Strongest terms first.
temp = temp.sort_values('tf_idf', ascending=False)
temp

Unnamed: 0,feature_number,tf_idf
5,430,0.853499
3,322,0.172975
93,4672,0.144446
74,3599,0.103692
88,4288,0.101146
...,...,...
52,2572,0.024936
19,1099,0.023812
37,1761,0.022122
39,1895,0.022085


In [462]:
# Keep just the first `topn` rows of the sorted table and translate
# their column numbers back into vocabulary terms.
topn = 15
topn_items = temp.head(topn)

word, tf_idf = [], []

for row_label, entry in topn_items.iterrows():
    word.append(feature_names[int(entry['feature_number'])])
    tf_idf.append(round(entry['tf_idf'], 3))

In [463]:
print(doc, '\n')

result = dict(zip(word, tf_idf))
print(result)

windup original comedy follow bernie shaddick tragic childhood day pursuit career inventor liken film raise arizona lebowski gritty cohen esque humor saturate bernie shaddick life encounter mace potential sale murderous mentor pose aunt audre uncle reg bernie rejection ridicule doesn persevere anyways nature nerve cruel co worker commercial real estate day job uphold delusional enthusiasm bernie unknowingly sell soul smile bernie girlfriend latrice roommate stuart accentuate bernie seemingly pathetic existence latrice sexy dirty sort love bernie prefer love money stuart usually clothe underpants obsess rubix cube live bernie allow pursue beer welfare check latrice myriad flashback eccentric aunt uncle learn mentor reg audre life strange habit murder meal wheel delivery boys worry bernie won inherit twisted trait aunt audre uncle reg actually squatter murder pose real family bernie dream invent product windup flashlight conjure laughter succeed conscience boy bernie hold flashlight gran

In [464]:
topn = 15
results =[]

In [465]:
def _top_tfidf_keywords(tfidf_row, vocabulary, limit):
    """Return ``{term: score}`` for the ``limit`` highest-weighted terms of one row.

    Args:
        tfidf_row: A single-row sparse TF-IDF matrix.
        vocabulary: Array mapping a column number to its term.
        limit: Maximum number of keywords to keep.

    Returns:
        dict: Terms mapped to their weight rounded to 3 decimals, inserted in
        descending weight order (ties keep ascending column order).
    """
    coo = tfidf_row.tocoo()
    # Stable sort on the negated weights gives descending order with deterministic ties.
    best = np.argsort(-coo.data, kind='stable')[:limit]
    return {vocabulary[coo.col[i]]: round(float(coo.data[i]), 3) for i in best}


# `results` is rebuilt from scratch here, so re-running this cell cannot append a
# second copy of every document (which would break `df['keywords'] = results`).
# Rows are addressed by position, so this does not depend on the index labels of
# `description_string`, and no per-document DataFrame is created.
results = [
    _top_tfidf_keywords(description_tfidf[row_pos], feature_names, topn)
    for row_pos in range(description_tfidf.shape[0])
]


In [466]:
len(results)

5000

In [467]:
df['keywords'] = results
df.head()

Unnamed: 0,Title,Genre,Description,description_tokens,description_string,keywords
0,Windup (2006),action,Windup is a very original comedy. We follow B...,"[windup, original, comedy, follow, bernie, sha...",windup original comedy follow bernie shaddick ...,"{'bernie': 0.853, 'aunt': 0.173, 'uncle': 0.14..."
1,Hitman (2014),action,"Rana and Shuvo, two siblings, are very differ...","[rana, shuvo, sibling, father, police, constab...",rana shuvo sibling father police constable dea...,"{'rana': 0.677, 'brother': 0.224, 'crime': 0.1..."
2,Taken by Force (2010),action,When a San Francisco detective goes hunting f...,"[san, francisco, detective, hunt, cruel, asian...",san francisco detective hunt cruel asian crime...,"{'nelson': 0.542, 'violent': 0.222, 'detective..."
3,Bui doi Cho Lon (2013),action,"A man name Lam picks up his girlfriend, but i...","[name, lam, pick, girlfriend, reveal, girlfrie...",name lam pick girlfriend reveal girlfriend act...,"{'lam': 0.744, 'gangster': 0.386, 'martial': 0..."
4,Siam Yuth: The Dawn of the Kingdom (2015),action,Thap and Sin are a homeless musician brothers...,"[thap, homeless, musician, brother, village, r...",thap homeless musician brother village raid kh...,"{'ram': 0.768, 'prince': 0.34, 'kingdom': 0.23..."


In [468]:
import pandas as pd

# Keep only the keyword names of each document; the TF-IDF scores are dropped.
# (Iterating a dict yields its keys, so list(d) == list(d.keys()).)
token_lists = df['keywords'].map(list)

# One row per document, one column per keyword slot.
transactions_df = pd.DataFrame(token_lists.tolist())

# Plain CSV: no index column and no header row.
transactions_df.to_csv('transactions.csv', header=False, index=False)


### 2.2 Association Rules Mining on keywords

In [469]:
transactions = pd.read_csv("transactions.csv", header=None)
transactions.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,bernie,aunt,uncle,real,stuart,welfare,cuban,invent,mentor,product,commercial,pose,estate,co,worker
1,rana,brother,crime,bravery,extent,police,shower,tall,tackle,promote,villain,arise,assume,sibling,brave
2,nelson,violent,detective,asian,francisco,device,san,terrorist,center,capture,partner,steal,gang,art,triad
3,lam,gangster,martial,artist,girlfriend,brother,kill,furious,fight,trap,reveal,stab,protection,guilty,lay
4,ram,prince,kingdom,village,plan,residence,stir,raid,villager,meet,frame,homeless,musician,determine,justice


In [470]:
# One keyword list ("transaction") per document, in document order.
transactions = [list(d.keys()) for d in results]
# Show a small sample only: printing every transaction floods the notebook output.
print(transactions[:5])



In [471]:
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd

# df['keywords'] holds one {keyword: score} dict per document;
# only the keyword names are needed for the transactions.
transactions = [list(keyword_scores.keys()) for keyword_scores in df['keywords']]

# One-hot encode: one row per document, one boolean column per distinct keyword.
te = TransactionEncoder()
data_encoded = te.fit_transform(transactions)

data_encoded_df = pd.DataFrame(data_encoded, columns=te.columns_)

# Sanity check on the encoded matrix size.
print(data_encoded_df.shape)


(5000, 5000)


In [472]:
# Step 1: Count in how many documents each keyword appears (column sums of the
# one-hot matrix), most frequent first
keyword_counts = data_encoded_df.sum().sort_values(ascending=False)

# Step 2: Select the TOP_N_KEYWORDS most frequent keywords
# (the earlier comment said 200, but the code has always kept 100)
TOP_N_KEYWORDS = 100
top_keywords = keyword_counts.head(TOP_N_KEYWORDS).index

# Step 3: Filter the encoded DataFrame to include only these columns
data_encoded_filtered = data_encoded_df[top_keywords]


In [473]:
print(data_encoded.sum(axis=1).mean())

14.9724


In [474]:
data_encoded_filtered.shape

(5000, 100)

In [475]:
data_encoded_filtered

Unnamed: 0,girl,father,brother,woman,mother,kill,school,police,wife,child,...,guy,ghost,jack,look,music,follow,officer,detective,lead,zombie
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,True,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4996,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4997,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4998,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [476]:
# Count the frequent itemsets found at each candidate minimum support.
support_candidates = (0.05, 0.01, 0.005, 0.003, 0.002)
for support in support_candidates:
    itemsets = apriori(
        data_encoded_filtered,
        min_support=support,
        use_colnames=True,
    )
    print(f"Support: {support}, Itemsets: {itemsets.shape[0]}")

Support: 0.05, Itemsets: 0
Support: 0.01, Itemsets: 64
Support: 0.005, Itemsets: 100
Support: 0.003, Itemsets: 105
Support: 0.002, Itemsets: 118


In [477]:
# Mine itemsets at the lowest support tried in the sweep above (0.002, 118 itemsets).
frequent_itemsets=apriori(data_encoded_filtered, min_support = 0.002, use_colnames = True)

In [478]:
frequent_itemsets[['support']].describe()

Unnamed: 0,support
count,118.0
mean,0.0113
std,0.006085
min,0.002
25%,0.0082
50%,0.0102
75%,0.01375
max,0.0292


In [479]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.0292,(girl)
1,0.0286,(father)
2,0.0266,(brother)
3,0.0264,(woman)
4,0.0254,(mother)
...,...,...
113,0.0034,"(police, officer)"
114,0.0026,"(daughter, wife)"
115,0.0020,"(husband, wife)"
116,0.0024,"(killer, murder)"


In [480]:
# Derive association rules from the frequent itemsets, keeping those with lift >= 1.
rules_l = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# `index` is documented as a bool; False states the intent instead of relying on None being falsy.
rules_l.to_csv("rulesl5.csv", index=False)
rules_l

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(girl),(school),0.0292,0.0234,0.0026,0.089041,3.805175,1.0,0.001917,1.072057,0.759374,0.052,0.067214,0.100076
1,(school),(girl),0.0234,0.0292,0.0026,0.111111,3.805175,1.0,0.001917,1.09215,0.754864,0.052,0.084375,0.100076
2,(father),(brother),0.0286,0.0266,0.0024,0.083916,3.15474,1.0,0.001639,1.062566,0.703126,0.045455,0.058882,0.087071
3,(brother),(father),0.0266,0.0286,0.0024,0.090226,3.15474,1.0,0.001639,1.067737,0.701681,0.045455,0.06344,0.087071
4,(father),(mother),0.0286,0.0254,0.0032,0.111888,4.405044,1.0,0.002474,1.097384,0.795746,0.062992,0.088742,0.118936
5,(mother),(father),0.0254,0.0286,0.0032,0.125984,4.405044,1.0,0.002474,1.111422,0.793133,0.062992,0.100251,0.118936
6,(father),(daughter),0.0286,0.0188,0.002,0.06993,3.719685,1.0,0.001462,1.054974,0.752687,0.044053,0.05211,0.088157
7,(daughter),(father),0.0188,0.0286,0.002,0.106383,3.719685,1.0,0.001462,1.087043,0.745169,0.044053,0.080073,0.088157
8,(brother),(kill),0.0266,0.0246,0.0024,0.090226,3.667706,1.0,0.001746,1.072134,0.747226,0.04918,0.067281,0.093893
9,(kill),(brother),0.0246,0.0266,0.0024,0.097561,3.667706,1.0,0.001746,1.078632,0.745694,0.04918,0.0729,0.093893
