In [209]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
from spacy.tokens import DocBin
import json
from youtube_transcript_api import YouTubeTranscriptApi
import string
import contractions
from sklearn.metrics import confusion_matrix, classification_report
from googleapiclient.discovery import build
import time
import os
import sys

### Key word Extractring 

In [210]:
API_KEY = 'AIzaSyBG7X8oJ1CYdPZd7F4gSo605Jf-EfD7IHM'

In [211]:
def get_video_statistics(video_id): 
    # video_id = '7cPLbiblb84'
    try:
        youtube = build('youtube', 'v3', developerKey=API_KEY)

        request = youtube.videos().list(
            part=['snippet','statistics'],
            id=video_id
        )
        response = request.execute()
        stats = response['items'][0]['statistics']
        viewCount = stats['viewCount'] if 'viewCount' in stats.keys() else 0
        likeCount = stats['likeCount'] if 'likeCount' in stats.keys() else 0
        commentCount = stats['commentCount'] if 'commentCount' in stats.keys() else 0 
        channelId = response['items'][0]['snippet']['channelId']
        channel_name = response['items'][0]['snippet']['channelTitle']

        print(response['items'][0]['statistics'],channelId,channel_name)
        
        return int(viewCount),int(likeCount),int(commentCount),channel_name,channelId
    except Exception as e:
        print(e)

def get_channel_statistics(channelId):
    try:
        youtube = build('youtube', 'v3', developerKey=API_KEY)
        request_channel =  youtube.channels().list(
            part=['statistics'],
            id=channelId
        )
        response_channel = request_channel.execute()
        subscriberCount= response_channel['items'][0]['statistics']['subscriberCount']
        return int(subscriberCount)
    except Exception as e:
        print(e)

def cleaning_sentence(text):
    text = text.lower()
    PUNCT_TO_REMOVE = string.punctuation
    ans = contractions.fix(text).translate(str.maketrans('', '', PUNCT_TO_REMOVE))
    ans = ans.replace('music','')
    ans = ans.replace("  ",' ')
    ans = " ".join(ans.split())
    return ans


In [212]:
test_video_ids = [
    'p0MvovsCxCk',
    '7cPLbiblb84',
    'rm_j6O8y148',
    'Lv0PkSkKeSo',
    '5zA6OFpkPe0',
    'Oz18u64bM8I',
    'p0MvovsCxCk',
    '7cPLbiblb84'
]

In [213]:
root = os.getcwd()
model_path = os.path.join(root.split('notebooks')[0], 'models', 'ner_mod','model-best')
nlp_ner = spacy.load(model_path)
nlp     = spacy.load("en_core_web_sm")
ner     = dict()
    
for video_id in test_video_ids:
    try:
        print(video_id)
        
        viewCount,likeCount,commentCount,channel_name,channelId = get_video_statistics(video_id)
        subscriberCount = get_channel_statistics(channelId)

        root = os.getcwd()
        file_path = os.path.join(root.split('notebooks')[0], 'data', 'transcripts')

        file = open(f'{file_path}/transcript_{video_id}.txt',"r")
        file_c = open(f'{file_path}/transcript_{video_id}.txt',"r")
        file_string = file_c.read()
        for text in file:
            doc_ner = nlp_ner(cleaning_sentence(text)) 

            for ent in doc_ner.ents:
                # print(ent,'->', ent.label_)
                
                doc = nlp(ent.text)

                rating = file_string.count(ent.text) + ((viewCount+likeCount+commentCount+subscriberCount)/(subscriberCount))
        
                lemma = ' '.join([token.lemma_ for token in doc])
                if lemma in ner.keys():
                    # ner[ent.label_].append(ent.text)
                    ner[lemma]['ENT'].add(ent.label_)
                    ner[lemma]['rating'] = (rating+ner[lemma]['rating'])/2
                    ner[lemma]['suggested_by'].add(channel_name)
                    ner[lemma]['video_id'].add(video_id)
                else:
                    # ner[ent.label_] = [ent.text]
                    ner[lemma] = {
                        'word': ent.text,
                        'ENT': set([ent.label_]),
                        # 'keywords': list,
                        'suggested_by': set([channel_name]),
                        'rating': rating,
                        'video_id':set([video_id])
                    }
            # print('--x-x-x--x-x-x-x-x-x-x---x-x-x--')
    except Exception as e:
        print(e)
# tokkenization()
df = pd.DataFrame(ner)
df = df.T
df = df.reset_index().rename(columns={'index':'lemma_word'})

df['ENT_len'] = df['ENT'].apply(lambda x: len(list(x))) 
df['ENT_max'] = df['ENT'].apply(lambda x: (list(x))[0])

df[df['lemma_word']!=df['word']]

df_base_info = df[['lemma_word','word','suggested_by','video_id']]
df_entity_info = df[['lemma_word','rating','ENT_max']]

df_base_info = df_base_info.explode(['suggested_by','video_id']).drop_duplicates().reset_index()
del df_base_info['index']
df_base_info['key_word_extracted'] = 0
df_base_info.head(2)


p0MvovsCxCk
{'viewCount': '281079', 'favoriteCount': '0', 'commentCount': '97'} UCoCU-RvDqV1JgmbRgh79UTQ Singhsman 
7cPLbiblb84
{'viewCount': '2031633', 'likeCount': '16193', 'favoriteCount': '0', 'commentCount': '864'} UC_P_sA6Jf3iSsFueMwIP3vg TheSocialTraveller
rm_j6O8y148
{'viewCount': '189021', 'likeCount': '2490', 'favoriteCount': '0', 'commentCount': '91'} UCQ0X2x6lozB7qG30ctFeUiA Saturday Shooters
Lv0PkSkKeSo
{'viewCount': '116809', 'likeCount': '2215', 'favoriteCount': '0', 'commentCount': '188'} UC2N5r2FvEOiDPKvaLW9rivw Aniruddha Patil
5zA6OFpkPe0
{'viewCount': '10198', 'likeCount': '201', 'favoriteCount': '0', 'commentCount': '33'} UCCq9CLWwVP4fdYy2N_ypTjg Sisters vs Globe
Oz18u64bM8I
{'viewCount': '9929', 'likeCount': '169', 'favoriteCount': '0', 'commentCount': '36'} UC-w8ULGFYLrjbdsZ6twWO3Q Travel Tales
p0MvovsCxCk
{'viewCount': '281079', 'favoriteCount': '0', 'commentCount': '97'} UCoCU-RvDqV1JgmbRgh79UTQ Singhsman 
7cPLbiblb84
{'viewCount': '2031636', 'likeCount': '16193

Unnamed: 0,lemma_word,word,suggested_by,video_id,key_word_extracted
0,sindhi dal,sindhi dal,Singhsman,p0MvovsCxCk,0
1,kudi chhola,kudi chhola,Singhsman,p0MvovsCxCk,0


In [None]:
# df_base_info = df[['lemma_word','word','suggested_by','video_id']]
# df_entity_info = df[['lemma_word','rating','ENT_max']]

# df_base_info = df_base_info.explode(['suggested_by','video_id']).drop_duplicates().reset_index()
# del df_base_info['index']
# df_base_info['key_word_extracted'] = 0
# df_base_info.head(2)

Unnamed: 0,lemma_word,word,suggested_by,video_id,key_word_extracted
0,sindhi dal,sindhi dal,Singhsman,p0MvovsCxCk,0
1,kudi chhola,kudi chhola,Singhsman,p0MvovsCxCk,0


In [253]:
def get_context_window(id, text, window_size=50):
    root = os.getcwd().split('wanderly.ai')[0]
    file_path = os.path.join(root, f'wanderly.ai/data/transcripts/')
    file = open(file_path + f'transcript_{id}.txt',"r")
    
    file_string = file.read()
    print(file_path + f'transcript_{id}.txt')
    # loc = 'city palace'
    start = file_string.find(text)
    end = file_string.rfind(text)
    context_text = (file_string[start-window_size:end+window_size])
    return context_text

In [254]:
df_base_info['details'] = df_base_info.apply(lambda x: get_context_window(x['video_id'],x['word'],100),axis=1)


d:\randomProjects\wanderly.ai/data/transcripts/transcript_p0MvovsCxCk.txt
d:\randomProjects\wanderly.ai/data/transcripts/transcript_p0MvovsCxCk.txt
d:\randomProjects\wanderly.ai/data/transcripts/transcript_p0MvovsCxCk.txt
d:\randomProjects\wanderly.ai/data/transcripts/transcript_p0MvovsCxCk.txt
d:\randomProjects\wanderly.ai/data/transcripts/transcript_p0MvovsCxCk.txt
d:\randomProjects\wanderly.ai/data/transcripts/transcript_p0MvovsCxCk.txt
d:\randomProjects\wanderly.ai/data/transcripts/transcript_5zA6OFpkPe0.txt
d:\randomProjects\wanderly.ai/data/transcripts/transcript_rm_j6O8y148.txt
d:\randomProjects\wanderly.ai/data/transcripts/transcript_Oz18u64bM8I.txt
d:\randomProjects\wanderly.ai/data/transcripts/transcript_7cPLbiblb84.txt
d:\randomProjects\wanderly.ai/data/transcripts/transcript_Lv0PkSkKeSo.txt
d:\randomProjects\wanderly.ai/data/transcripts/transcript_p0MvovsCxCk.txt
d:\randomProjects\wanderly.ai/data/transcripts/transcript_p0MvovsCxCk.txt
d:\randomProjects\wanderly.ai/data/tra

In [255]:

nlp = spacy.load("en_core_web_sm")
def process_text(text):
    """
    Removes stop words and lemmatizes the input text.

    Args:
        text (str): The input text.

    Returns:
        str: The processed text with stop words removed and lemmatized.
    """
    
    doc = nlp(text)
    filtered_and_lemmatized_tokens = [token.lemma_ for token in doc if not token.is_stop and len(token.text)>2]
    index = 1
    while index<len(filtered_and_lemmatized_tokens):
        if filtered_and_lemmatized_tokens[index] == filtered_and_lemmatized_tokens[index-1]:
            del filtered_and_lemmatized_tokens[index-1]
        else:
            index = index+1

    return " ".join(filtered_and_lemmatized_tokens).replace('\n','')


In [256]:
df_base_info['clean_test'] = df_base_info['details'].apply(process_text)
df_base_info


Unnamed: 0,lemma_word,word,suggested_by,video_id,key_word_extracted,details,clean_test
0,sindhi dal,sindhi dal,Singhsman,p0MvovsCxCk,0,n have come well in the morning time very much...,come morning time breakfast center breakfast g...
1,kudi chhola,kudi chhola,Singhsman,p0MvovsCxCk,0,st of our sindhis you have heard that dal dish...,sindhis hear dal dish sindhis people feel good...
2,rice,rice,Singhsman,p0MvovsCxCk,0,at dal dish in sindhis people feel very good a...,dal dish sindhis people feel good absolutely k...
3,hai pri,hai pri,Singhsman,p0MvovsCxCk,0,i rice is ok these three or four things are ma...,rice thing maintain morning pricing hai pri si...
4,tamarind sauce,tamarind sauce,Singhsman,p0MvovsCxCk,0,add ok so the lentils go but you have a little...,add lentil little bit chickpea onion tamarind ...
...,...,...,...,...,...,...,...
191,smriti van,smriti van,Sisters vs Globe,5zA6OFpkPe0,0,ht and sound show you more fun\ncome\nand it i...,sound fun come unwt 720 morning ring smriti va...
192,reimber,reimber,Sisters vs Globe,5zA6OFpkPe0,0,walk then any animal also means hands can also...,walk animal mean hand situation leopard reimbe...
193,ram pura,ram pura,Travel Tales,Oz18u64bM8I,0,see very beautiful and very great place and yo...,beautiful great place jaipur second day daya r...
194,maggi,maggi,Travel Tales,Oz18u64bM8I,0,appreciation\nthat a appreciation that a do it...,appreciation happen right stay maggi right str...


In [257]:
df_base_info = df_base_info.merge(df_entity_info,on=['lemma_word'],how='inner')
landmarks = df_base_info[df_base_info['ENT_max'].isin(['LANDMARK','AREA'])].copy().reset_index().drop(columns='index',axis=1)
food = df_base_info[df_base_info['ENT_max'].isin(['FOOD','FOOD SHOP'])].copy().reset_index().drop(columns='index',axis=1)
# df_base_info = df_base_info[df_base_info['ENT_max'].isin(['LANDMARK','AREA','FOOD','FOOD SHOP'])].copy().reset_index().drop(columns='index',axis=1)

In [258]:
df_base_info = df_base_info[df_base_info['ENT_max'].isin(['LANDMARK','AREA','FOOD','FOOD SHOP'])].copy().reset_index().drop(columns='index',axis=1)

In [259]:
# from sklearn.feature_extraction.text import CountVectorizer

# cv = CountVectorizer(stop_words='english',ngram_range=(1,2),min_df=0.28)
# dim = cv.fit_transform(food['clean_test'])
# cv_df = pd.DataFrame(dim.toarray(),columns=cv.get_feature_names_out())
# print(cv_df.columns)
# cv_df

In [260]:
from sklearn.feature_extraction.text import TfidfVectorizer

tvid_loc = TfidfVectorizer(stop_words='english',ngram_range=(1,2),max_df=0.2,min_df=0.05)
X_locations = tvid_loc.fit_transform(landmarks['clean_test'])
tvid_loc_df = pd.DataFrame(X_locations.toarray(),columns=tvid_loc.get_feature_names_out())

tvid_food = TfidfVectorizer(stop_words='english',ngram_range=(1,2),max_df=0.3,min_df=0.15)
X_food = tvid_food.fit_transform(food['clean_test'])
tvid_food_df = pd.DataFrame(X_food.toarray(),columns=tvid_food.get_feature_names_out())

# tvid_keyWords = TfidfVectorizer(stop_words='english',ngram_range=(1,2),min_df=0.15)
# X_keyWords = tvid_keyWords.fit_transform(df_base_info['clean_test'])
# tvid_keyWords_df = pd.DataFrame(X_keyWords.toarray(),columns=tvid_keyWords.get_feature_names_out())

print(tvid_food_df.columns)
print(tvid_loc_df.columns)
# print(tvid_keyWords_df.columns)

# df_base_info = pd.concat([df_base_info,tvid_keyWords_df],axis=1)
# df_base_info.head(2)

Index(['addition', 'addition samosa', 'address', 'address remain', 'ago',
       'area', 'arora', 'art', 'bhandar', 'big',
       ...
       'today', 'today shankar', 'tomato', 'try', 'uncle', 'vegetable',
       'wonderful', 'year old', 'year run', 'yes'],
      dtype='object', length=136)
Index(['1734', '953', '953 window', 'accord', 'accuracy', 'accuracy second',
       'address', 'albert', 'amar', 'amar fort',
       ...
       'water', 'water time', 'way', 'weapon', 'window', 'work', 'world',
       'world large', 'write', 'year'],
      dtype='object', length=301)


In [261]:
# H = pd.DataFrame(H,index=['Snaks','Evening','Desert','Tikka'])
# W = pd.DataFrame(W,columns=['Snaks','Evening','Desert','Tikka'])

In [262]:
from sklearn.decomposition import NMF
nmf_model_loc = NMF(n_components=7)
W_locations = nmf_model_loc.fit_transform(X_locations)
H_locations = nmf_model_loc.components_

nmf_model_food = NMF(n_components=4)
W_food = nmf_model_food.fit_transform(X_food)
H_food = nmf_model_food.components_

def display_topic(H,tvid):
    for topic_num, topic_array in enumerate(H):
        top_features = topic_array.argsort()[::-1][:10]
        top_words = [tvid.get_feature_names_out()[i] for i in top_features]
        print(f"Topic {topic_num +1}: {top_words}")

display_topic(H_locations,tvid_loc)
display_topic(H_food,tvid_food)
# H = pd.DataFrame(H,index=['Forts','Generic','Temple','Temples & Architectur','Scientific Monuments','Lakeside Forts','Views'])
W_locations = pd.DataFrame(W_locations,columns=['Forts','Generic','Temple','Temples & Architectur','Scientific Monuments','Lakeside Forts','Views'])

# H = pd.DataFrame(H,index=['Forts','Generic','Temple','Temples & Architectur','Scientific Monuments','Lakeside Forts','Views'])
W_food = pd.DataFrame(W_food,columns=['Snaks','Evening','Desert','Tikka'])

food = pd.concat([food,W_food],axis=1)
landmarks = pd.concat([landmarks,W_locations],axis=1)

df_base_info = pd.concat([landmarks,food]).fillna(0)
df_base_info.head(2)


Topic 1: ['hawa', 'hawa mahal', 'floor', 'temple hawa', 'temple floor', 'window', 'temple second', 'ratan', 'prakash temple', 'second vichitra']
Topic 2: ['build', 'amer', 'amer fort', 'jaigarh', 'singh', 'jaigarh fort', 'king', 'nahargarh', 'courtyard', 'inr']
Topic 3: ['mandir', 'birla', 'park', 'dungri', 'moti dungri', 'moti', 'visit jaipur', 'amar', 'jain', 'similarly']
Topic 4: ['good', 'tea', 'year', 'start', 'address', 'father', 'food', 'way', 'morning', 'road']
Topic 5: ['jantar', 'mantar', 'jantar mantar', 'instrument', 'museum', 'city palace', 'mean', 'ticket', 'hall', 'hawa mahal']
Topic 6: ['lake', 'hai', 'jal mahal', 'jal', 'lake fort', 'box', 'description', 'description box', 'view', 'link description']
Topic 7: ['sun', 'little', 'toll', 'bada', 'climb', 'right', 'world large', 'large', 'world', 'lot']
Topic 1: ['samosa', 'start', 'sir', 'eat', 'chilli', 'address', 'remain', 'shankar samosa', 'shankar', 'good address']
Topic 2: ['evening', 'let', 'time', 'wonderful', 'fla

  df_base_info = pd.concat([landmarks,food]).fillna(0)


Unnamed: 0,lemma_word,word,suggested_by,video_id,key_word_extracted,details,clean_test,rating,ENT_max,Forts,Generic,Temple,Temples & Architectur,Scientific Monuments,Lakeside Forts,Views,Snaks,Evening,Desert,Tikka
0,hai pri,hai pri,Singhsman,p0MvovsCxCk,0,i rice is ok these three or four things are ma...,rice thing maintain morning pricing hai pri si...,19.909299,LANDMARK,0.0,0.0,0.0,0.184352,0.0,0.162517,0.0,0.0,0.0,0.0,0.0
1,allredi trai,allredi trai,Singhsman,p0MvovsCxCk,0,e lentil and of tapre mirchi will keep my effo...,lentil tapre mirchi effort basically let thing...,19.909299,LANDMARK,0.0,0.0,0.0,0.07997,0.00858,0.080938,0.030002,0.0,0.0,0.0,0.0


In [263]:
tvid_keyWords = TfidfVectorizer(stop_words='english',ngram_range=(1,2),min_df=0.15)
X_keyWords = tvid_keyWords.fit_transform(df_base_info['clean_test'])
tvid_keyWords_df = pd.DataFrame(X_keyWords.toarray(),columns=tvid_keyWords.get_feature_names_out())

print(tvid_keyWords_df.columns)


Index(['build', 'city', 'come', 'day', 'food', 'fort', 'good', 'inside',
       'jaipur', 'king', 'know', 'like', 'look', 'lot', 'mahal', 'palace',
       'people', 'place', 'singh', 'temple', 'thing', 'time', 'timing', 'view',
       'visit', 'way', 'world', 'year'],
      dtype='object')


In [268]:
df_base_info = df_base_info.reset_index().drop(['index'],axis=1)

In [269]:
df_base_info = pd.concat([df_base_info,tvid_keyWords_df],axis=1)
df_base_info.head(2)

Unnamed: 0,lemma_word,word,suggested_by,video_id,key_word_extracted,details,clean_test,rating,ENT_max,Forts,...,singh,temple,thing,time,timing,view,visit,way,world,year
0,hai pri,hai pri,Singhsman,p0MvovsCxCk,0,i rice is ok these three or four things are ma...,rice thing maintain morning pricing hai pri si...,19.909299,LANDMARK,0.0,...,0.0,0.0,0.907248,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,allredi trai,allredi trai,Singhsman,p0MvovsCxCk,0,e lentil and of tapre mirchi will keep my effo...,lentil tapre mirchi effort basically let thing...,19.909299,LANDMARK,0.0,...,0.0,0.0,0.667711,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# landmarks = pd.concat([landmarks,W],axis=1)


In [None]:
# food = pd.concat([food,W],axis=1)
# landmarks = pd.concat([landmarks,W],axis=1)
# df_base_info = pd.concat([landmarks,food]).fillna(0)
# df_base_info.head(2)

In [None]:
# df_base_info = pd.concat([landmarks,food]).fillna(0)
# df_base_info.head(2)

In [None]:
# def normalize_rateing(max_r,min_r,current_r):
#     return (current_r - min_r)/(max_r + min_r)

In [270]:
min_max_category_rateing = df_base_info.groupby(by='ENT_max')['rating'].agg(['min','max']).reset_index().rename(columns={'min':'min_category_rating','max':'max_category_rating'})
min_max_category_rateing

Unnamed: 0,ENT_max,min_category_rating,max_category_rating
0,AREA,2.372537,43.909299
1,FOOD,2.660253,35.640255
2,FOOD SHOP,2.372537,35.640255
3,LANDMARK,2.372537,52.639913


In [271]:
df_base_info = df_base_info.merge(min_max_category_rateing,on='ENT_max')
df_base_info['normalize_rating'] = (df_base_info['rating'] - df_base_info['min_category_rating'])/(df_base_info['max_category_rating'] + df_base_info['min_category_rating'])
df_base_info.drop(['min_category_rating','max_category_rating'],axis=1,inplace=True)
df_base_info


Unnamed: 0,lemma_word,word,suggested_by,video_id,key_word_extracted,details,clean_test,rating,ENT_max,Forts,...,temple,thing,time,timing,view,visit,way,world,year,normalize_rating
0,hai pri,hai pri,Singhsman,p0MvovsCxCk,0,i rice is ok these three or four things are ma...,rice thing maintain morning pricing hai pri si...,19.909299,LANDMARK,0.0,...,0.0,0.907248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.318778
1,allredi trai,allredi trai,Singhsman,p0MvovsCxCk,0,e lentil and of tapre mirchi will keep my effo...,lentil tapre mirchi effort basically let thing...,19.909299,LANDMARK,0.0,...,0.0,0.667711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.318778
2,allredi,allredi,Singhsman,p0MvovsCxCk,0,e lentil and of tapre mirchi will keep my effo...,lentil tapre mirchi effort basically let thing...,20.909299,AREA,0.0,...,0.0,0.576889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.400519
3,nana ji,nana ji,Singhsman,p0MvovsCxCk,0,name sir gulshan bhatia sir our father started...,sir gulshan bhatia sir father start paneer goo...,19.909299,LANDMARK,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.318778
4,vaccine,vaccines,Singhsman,p0MvovsCxCk,0,something when the pure cheese was with you a...,pure cheese blessing cheese introduction vacci...,19.909299,AREA,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.378912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,bharat,bharat,Aniruddha Patil,Lv0PkSkKeSo,0,go and go with that the being sed i will c yo...,se till den tech care bye jai hind jai bharat,2.372537,FOOD SHOP,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
175,aamir food,aamir food,Sisters vs Globe,5zA6OFpkPe0,0,comes to jaipur take the america will also sho...,come jaipur america thank brother appreciation...,2.660253,FOOD SHOP,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007569
176,reimber,reimber,Sisters vs Globe,5zA6OFpkPe0,0,walk then any animal also means hands can also...,walk animal mean hand situation leopard reimbe...,2.660253,FOOD,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
177,maggi,maggi,Travel Tales,Oz18u64bM8I,0,appreciation\nthat a appreciation that a do it...,appreciation happen right stay maggi right str...,3.013400,FOOD,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009220


In [272]:
df_base_info.columns

Index(['lemma_word', 'word', 'suggested_by', 'video_id', 'key_word_extracted',
       'details', 'clean_test', 'rating', 'ENT_max', 'Forts', 'Generic',
       'Temple', 'Temples & Architectur', 'Scientific Monuments',
       'Lakeside Forts', 'Views', 'Snaks', 'Evening', 'Desert', 'Tikka',
       'build', 'city', 'come', 'day', 'food', 'fort', 'good', 'inside',
       'jaipur', 'king', 'know', 'like', 'look', 'lot', 'mahal', 'palace',
       'people', 'place', 'singh', 'temple', 'thing', 'time', 'timing', 'view',
       'visit', 'way', 'world', 'year', 'normalize_rating'],
      dtype='object')

In [275]:
req_col = ['Forts', 'Generic',
       'Temple', 'Temples & Architectur', 'Scientific Monuments',
       'Lakeside Forts', 'Views', 'Snaks', 'Evening', 'Desert', 'Tikka',
       'normalize_rating', 'build', 'city', 'come', 'day', 'food',
       'fort', 'good', 'inside', 'jaipur', 'king', 'know', 'like', 'look',
       'lot', 'mahal', 'palace', 'people', 'place', 'singh', 'temple', 'thing',
       'time', 'timing', 'view', 'visit', 'way', 'world', 'year']

In [276]:
X = df_base_info[req_col]
Y = df_base_info[['lemma_word']]

In [None]:
'''
What excites you most about traveling?
Which of these travel quotes resonates with you the most?
If you had a 3-day break, what would you do?
What type of destinations do you find most appealing?
What kind of experiences do you look for while traveling?
What do you often post or talk about after your trips?
Which of these describe you best when traveling?
What's the first thing you usually do after reaching a new place?
Describe your dream travel day in 2–3 sentences.
'''

"\nWhat excites you most about traveling?\nWhich of these travel quotes resonates with you the most?\nIf you had a 3-day break, what would you do?\nWhat type of destinations do you find most appealing?\nWhat kind of experiences do you look for while traveling?\nWhat do you often post or talk about after your trips?\nWhich of these describe you best when traveling?\nWhat's the first thing you usually do after reaching a new place?\nDescribe your dream travel day in 2–3 sentences.\n"

In [277]:
user_input = ['I like to trave to find me, enjoy, capture the movement',
'Like to explore the culture, heritage and local food',
'relax chill with friends',
'peacefull place calm and relaxing',
'chill good local food instagram worthy locations, vloging',
'Explorer – I love wandering and getting lost',
'Explore the surroundings on foot',
'Dream travel worule be lost in the movement, relax, local food, chill vibes, haritage, mountain person but \
also like beaches and water side resorts']


In [278]:
user_input = ' '.join(user_input)
print(user_input)

I like to trave to find me, enjoy, capture the movement Like to explore the culture, heritage and local food relax chill with friends peacefull place calm and relaxing chill good local food instagram worthy locations, vloging Explorer – I love wandering and getting lost Explore the surroundings on foot Dream travel worule be lost in the movement, relax, local food, chill vibes, haritage, mountain person but also like beaches and water side resorts


In [279]:
user = pd.DataFrame({'name':['Jatin'],'desc':[user_input]})
user

Unnamed: 0,name,desc
0,Jatin,"I like to trave to find me, enjoy, capture the..."


In [280]:
user['clean_dec'] = user['desc'].apply(lambda x: process_text(cleaning_sentence(x)))
user

Unnamed: 0,name,desc,clean_dec
0,Jatin,"I like to trave to find me, enjoy, capture the...",like trave find enjoy capture movement like ex...


In [281]:
X_locations = tvid_loc.transform(user['clean_dec'])
tvid_loc_df = pd.DataFrame(X_locations.toarray(),columns=tvid_loc.get_feature_names_out())

X_food = tvid_food.transform(user['clean_dec'])
tvid_food_df = pd.DataFrame(X_food.toarray(),columns=tvid_food.get_feature_names_out())

X_keyWords = tvid_keyWords.transform(user['clean_dec'])
tvid_keyWords_df = pd.DataFrame(X_keyWords.toarray(),columns=tvid_keyWords.get_feature_names_out())
# tvid_keyWords_df = pd.DataFrame(X_keyWords.toarray(),columns=tvid_keyWords.get_feature_names_out())



In [282]:
tvid_keyWords_df

Unnamed: 0,build,city,come,day,food,fort,good,inside,jaipur,king,...,singh,temple,thing,time,timing,view,visit,way,world,year
0,0.0,0.0,0.0,0.0,0.699599,0.0,0.191613,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [283]:
W_locations = nmf_model_loc.transform(X_locations)
H_locations = nmf_model_loc.components_

W_food = nmf_model_food.transform(X_food)
H_food = nmf_model_food.components_

W_locations = pd.DataFrame(W_locations,columns=['Forts','Generic','Temple','Temples & Architectur','Scientific Monuments','Lakeside Forts','Views'])

# H = pd.DataFrame(H,index=['Forts','Generic','Temple','Temples & Architectur','Scientific Monuments','Lakeside Forts','Views'])
W_food = pd.DataFrame(W_food,columns=['Snaks','Evening','Desert','Tikka'])


In [284]:
type(W_locations)

pandas.core.frame.DataFrame

In [285]:
type(W_food)

pandas.core.frame.DataFrame

In [286]:
user_transform = pd.concat([user,tvid_keyWords_df,W_locations,W_food],axis=1)
user_transform

Unnamed: 0,name,desc,clean_dec,build,city,come,day,food,fort,good,...,Generic,Temple,Temples & Architectur,Scientific Monuments,Lakeside Forts,Views,Snaks,Evening,Desert,Tikka
0,Jatin,"I like to trave to find me, enjoy, capture the...",like trave find enjoy capture movement like ex...,0.0,0.0,0.0,0.0,0.699599,0.0,0.191613,...,0.025911,0.0,0.108836,0.040124,0.055411,0.003221,0.051871,0.144062,0.00303,0.011624


In [301]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

def get_similar_objects(check,X,Y,top_n=5):
    common = [i for i in X.columns if i in check.columns]

    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X[common])
    user_scaled = scaler.transform(check[common])
    
    # Compute cosine similarity between user vector and all item vectors
    similarity_scores = cosine_similarity(user_scaled, X[common])[0]

    # Add scores to DataFrame
    Y['similarity'] = similarity_scores

    # Sort top recommendations
    top_recommendations = Y.sort_values(by='similarity', ascending=False).head(10)
    return top_recommendations

In [302]:
get_similar_objects(user_transform,X,Y,10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y['similarity'] = similarity_scores


Unnamed: 0,lemma_word,similarity
128,subscribe,0.633513
7,sodala,0.598635
178,arun bhai,0.591013
40,chowki dhani,0.558696
166,dhaba,0.492874
127,sambar lake,0.462036
8,gulab ji,0.447595
163,chaat,0.392232
175,aamir food,0.376524
5,raja park,0.360683


In [295]:
df_base_info[df_base_info['lemma_word']=='sambar lake']

Unnamed: 0,lemma_word,word,suggested_by,video_id,key_word_extracted,details,clean_test,rating,ENT_max,Forts,...,temple,thing,time,timing,view,visit,way,world,year,normalize_rating
127,sambar lake,sambar lake,Aniruddha Patil,Lv0PkSkKeSo,0,m jaipur a distance of 75 km but indias bigges...,jaipur distance india big salt settle water la...,6.372537,LANDMARK,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.456381,0.072711


In [296]:
get_similar_objects(df_base_info[df_base_info['lemma_word']=='chowki dhani'],X,Y,5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y['similarity'] = similarity_scores


Unnamed: 0,lemma_word,similarity
40,chowki dhani,1.0
7,sodala,0.825258
166,dhaba,0.771376
56,nahargarh,0.639082
8,gulab ji,0.581485
153,shankar samosa,0.564242
6,shimla tutter,0.540364
177,maggi,0.537956
16,hawa mahal,0.537956
141,tamarind sauce,0.52352


In [None]:
top_recommendations = Y.sort_values(by='similarity', ascending=False).head(10)
top_recommendations

Unnamed: 0,lemma_word,similarity
27,pune,0.656606
85,zenana deodi,0.61318
9,nana ji,0.607636
49,mubarak mahal,0.546297
180,chaat,0.546149
50,palace of reception,0.519179
195,arun bhai,0.513964
28,hawa mahal,0.482137
178,raman dosa,0.38007
57,amer fort,0.370288


In [None]:
# Re-import required modules after environment reset
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd

# Define the main function to recommend keywords
def recommend_keywords_for_user(
    user_input,
    df_base_info,
    tfidf_models,
    nmf_models,
    keyword_vectorizer,
    process_text_func,
    req_col,
    topic_labels,
    normalize=True,
    top_n=10
):
    """
    Full keyword recommendation pipeline from raw user input.

    Args:
        user_input (str): Raw user description or answers.
        df_base_info (pd.DataFrame): DataFrame containing features and metadata.
        tfidf_models (dict): Dictionary with 'location' and 'food' TfidfVectorizers.
        nmf_models (dict): Dictionary with 'location' and 'food' NMF models.
        keyword_vectorizer: TF-IDF vectorizer used for general keywords.
        process_text_func (callable): Text cleaning and lemmatization function.
        req_col (list): Final column names used in cosine similarity.
        topic_labels (dict): {'location': [...], 'food': [...]}
        normalize (bool): Whether to scale all features.
        top_n (int): Number of recommendations to return.

    Returns:
        pd.DataFrame: Top N recommended keywords/entities.
    """
    # 1. Clean and prepare user input
    user_clean = process_text_func(user_input)

    # 2. Transform into TF-IDF vectors
    X_loc = tfidf_models['location'].transform([user_clean])
    X_food = tfidf_models['food'].transform([user_clean])
    X_key = keyword_vectorizer.transform([user_clean])

    # 3. Apply NMF topic models
    W_loc = nmf_models['location'].transform(X_loc)
    W_food = nmf_models['food'].transform(X_food)

    # 4. Build user feature vector
    user_vector = np.concatenate([
        W_loc[0],
        W_food[0],
        [0.5],  # normalize_rating neutral
        X_key.toarray()[0][:len(req_col) - (len(W_loc[0]) + len(W_food[0]) + 1)]
    ])
    user_vector = pd.DataFrame([user_vector], columns=req_col[:len(user_vector)])

    # 5. Feature scaling (optional)
    if normalize:
        scaler = MinMaxScaler()
        X_scaled = scaler.fit_transform(df_base_info[req_col])
        user_scaled = scaler.transform(user_vector)
    else:
        X_scaled = df_base_info[req_col].values
        user_scaled = user_vector.values

    # 6. Cosine similarity
    similarity_scores = cosine_similarity(user_scaled, X_scaled)[0]
    df_result = df_base_info.copy()
    df_result['score'] = similarity_scores

    # 7. Return top N
    return df_result.sort_values(by='score', ascending=False).head(top_n)[
        ['lemma_word', 'ENT_max', 'video_id', 'suggested_by', 'score']
    ]
