In [107]:
import pandas as pd
import pickle as pkl
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import csv

# Load the raw children-query export into a DataFrame.
# As the displayed output shows, each Query value still carries a trailing
# tab + label suffix at this point, and several empty 'Unnamed: N' columns
# come along with it.
raw_csv_path = 'data/ChildrenQueries.csv'
df = pd.read_csv(raw_csv_path, quoting=csv.QUOTE_ALL)
df

Unnamed: 0,Query,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,How did paul die from fast in the fouris\t1,,,,,,,
1,Fire belly toad\t1,,,,,,,
2,Now im stressed out\t1,,,,,,,
3,fast in the fouris\t1,,,,,,,
4,How many phones sell in a day\t1,,,,,,,
...,...,...,...,...,...,...,...,...
598,superbowl sunday?\t0,,,,,,,
599,superhry cz\t0,,,,,,,
600,superior motors\t0,,,,,,,
601,supporta Java oppure\t0,,,,,,,


In [108]:
# Discard the seven placeholder columns 'Unnamed: 1' .. 'Unnamed: 7';
# only the Query column is kept.
unnamed_columns = [f'Unnamed: {n}' for n in range(1, 8)]
df = df.drop(columns=unnamed_columns)

df

Unnamed: 0,Query
0,How did paul die from fast in the fouris\t1
1,Fire belly toad\t1
2,Now im stressed out\t1
3,fast in the fouris\t1
4,How many phones sell in a day\t1
...,...
598,superbowl sunday?\t0
599,superhry cz\t0
600,superior motors\t0
601,supporta Java oppure\t0


In [109]:
# Remove the trailing "<TAB><label>" suffix that the raw file appends to every
# query. Anchoring the pattern to the end of the string (instead of blindly
# cutting the last two characters) keeps the cell safe to re-run, leaves
# queries without a suffix untouched, and copes with multi-digit labels.
df['Query'] = df['Query'].str.replace(r'\t\d+$', '', regex=True)
df

Unnamed: 0,Query
0,How did paul die from fast in the fouris
1,Fire belly toad
2,Now im stressed out
3,fast in the fouris
4,How many phones sell in a day
...,...
598,superbowl sunday?
599,superhry cz
600,superior motors
601,supporta Java oppure


In [110]:
# Precompute how often each distinct query string occurs.
frequencies = df['Query'].value_counts()

# Series.map performs the lookup against the `frequencies` index in one pass.
# A query that is absent from `frequencies` (e.g. a missing value, which
# value_counts skips by default) becomes NaN instead of raising KeyError.
df['QueryFrequency'] = df['Query'].map(frequencies)

# Persist the frame with the new QueryFrequency column (tab-separated, no index).
df.to_csv('data/children-with-frequency.txt', sep='\t', index=False)

# Save the per-query counts on their own as a two-column table.
frequencies_df = pd.DataFrame({'Query': frequencies.index, 'Frequency': frequencies.values})
frequencies_df.to_csv('data/children-query-frequencies-precomputed.txt', sep='\t', index=False)

# Highest frequency queries (top 10); value_counts is already sorted descending.
print("Top 10 queries by frequency:")
frequencies.head(10)

Top 10 queries by frequency:


Query
youtube                             4
games                               3
Star wars                           2
weather                             2
best walking shoes for babies       1
best gps deals                      1
best love novels                    1
best professional digital camera    1
best rc truck                       1
best songs of the 80s and 90s       1
Name: count, dtype: int64

In [64]:
import nltk
# Fetch the 'punkt' tokenizer package through NLTK's downloader.
# NOTE(review): presumably this is the data word_tokenize relies on — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andreea\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [112]:
# Tokenise every query with NLTK's word_tokenize and store the resulting
# token list next to the query text.
query_tokens = df['Query'].map(word_tokenize)
df['QueryTokens'] = query_tokens

df

Unnamed: 0,Query,QueryFrequency,QueryTokens
0,How did paul die from fast in the fouris,1,"[How, did, paul, die, from, fast, in, the, fou..."
1,Fire belly toad,1,"[Fire, belly, toad]"
2,Now im stressed out,1,"[Now, im, stressed, out]"
3,fast in the fouris,1,"[fast, in, the, fouris]"
4,How many phones sell in a day,1,"[How, many, phones, sell, in, a, day]"
...,...,...,...
598,superbowl sunday?,1,"[superbowl, sunday, ?]"
599,superhry cz,1,"[superhry, cz]"
600,superior motors,1,"[superior, motors]"
601,supporta Java oppure,1,"[supporta, Java, oppure]"


In [113]:
# Build one single-query session (a one-element list) per row.
# Iterating the Query column directly avoids materialising a Series for every
# row as iterrows() does, and lets tqdm know the total up front.
sessions = [[query] for query in tqdm(df['Query'], desc='Creating sessions')]

# Persist the sessions for the downstream steps.
with open('data/children-sessions.pkl', 'wb') as f:
    pkl.dump(sessions, f)

# Preview; slicing keeps this safe when there are fewer than 10 sessions.
print("First 10 sessions:")
for session in sessions[:10]:
    print(session)

Creating sessions: 603it [00:00, 17929.59it/s]

First 10 sessions:
['How did paul die from fast in the fouris']
['Fire belly toad']
['Now im stressed out']
['fast in the fouris']
['How many phones sell in a day']
['How long do toads live']
['amozan']
['Fast and the ferious']
["Dominic's early life in fast an ferious"]
['what is a cheeta']





In [114]:
# Hold out 25% of the sessions for testing (75% train); the fixed random_state
# makes the split repeatable.
from sklearn.model_selection import train_test_split

train, test = train_test_split(sessions, test_size=0.25, random_state=42)

# Pickle both partitions, train first and then test.
for split_path, split_sessions in (
    ('data/children-train.pkl', train),
    ('data/children-test.pkl', test),
):
    with open(split_path, 'wb') as f:
        pkl.dump(split_sessions, f)

In [84]:
# Show the frame as it currently stands (Query, QueryFrequency, QueryTokens).
df

Unnamed: 0,Query,QueryFrequency,QueryTokens
0,How did paul die from fast in the fouris,1,"[How, did, paul, die, from, fast, in, the, fou..."
1,Fire belly toad,1,"[Fire, belly, toad]"
2,Now im stressed out,1,"[Now, im, stressed, out]"
3,fast in the fouris,1,"[fast, in, the, fouris]"
4,How many phones sell in a day,1,"[How, many, phones, sell, in, a, day]"
...,...,...,...
598,superbowl sunday?,1,"[superbowl, sunday, ?]"
599,superhry cz,1,"[superhry, cz]"
600,superior motors,1,"[superior, motors]"
601,supporta Java oppure,1,"[supporta, Java, oppure]"


In [121]:
# Replace each token list with its string representation. This overwrites the
# list-valued column in place, so from here on QueryTokens holds plain strings.
token_strings = df['QueryTokens'].astype(str)
df['QueryTokens'] = token_strings
token_strings

0      ['How', 'did', 'paul', 'die', 'from', 'fast', ...
1                              ['Fire', 'belly', 'toad']
2                       ['Now', 'im', 'stressed', 'out']
3                        ['fast', 'in', 'the', 'fouris']
4      ['How', 'many', 'phones', 'sell', 'in', 'a', '...
                             ...                        
598                         ['superbowl', 'sunday', '?']
599                                   ['superhry', 'cz']
600                               ['superior', 'motors']
601                       ['supporta', 'Java', 'oppure']
602    ['supreme', 'ventures', 'jamaica', 'lottery', ...
Name: QueryTokens, Length: 603, dtype: object

In [122]:
# Write the full frame (query, frequency, tokens) as a tab-separated file
# without the index.
df.to_csv('data/children-with-query-frequencies-and-tokens.txt', sep='\t', index=False)

# Also export only the query text with its tokens, exposing the token column
# under the header 'Tokens'.
tokens_df = df[['Query', 'QueryTokens']].rename(columns={'QueryTokens': 'Tokens'})
tokens_df.to_csv('data/children-query-tokens-precomputed.txt', sep='\t', index=False)


In [123]:
# IDF weights of every vocabulary term found in the query token strings.
# Note that only the per-term IDF values are exported below; the TF-IDF matrix
# itself is kept in X and not written to disk in this cell.
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer with its default settings
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the QueryTokens column and keep the transformed matrix.
# NOTE(review): QueryTokens holds string values only after the astype(str) cell
# has run; this cell presumably depends on that — confirm the execution order.
X = vectorizer.fit_transform(df['QueryTokens'])

# Get the feature names (the learned vocabulary terms)
feature_names = vectorizer.get_feature_names_out()

# Create a dataframe pairing each vocabulary term with its idf score
idf_scores = pd.DataFrame({'Word': feature_names, 'IDF': vectorizer.idf_})

# Save the idf scores as a tab-separated file without the index
idf_scores.to_csv('data/children-idf-scores-precomputed.txt', sep='\t', index=False)

idf_scores

Unnamed: 0,Word,IDF
0,13,6.710427
1,16,6.710427
2,2000,6.710427
3,2009,6.710427
4,2015,6.710427
...,...,...
1127,your,6.304962
1128,youtub,6.710427
1129,youtube,5.457664
1130,zavod,6.710427
