In [1]:
import pandas as pd
# Training messages and their labels are stored in two separate CSV files;
# they are joined on `id` in the next cell.
df = pd.read_csv('train_data.csv')
df_sol = pd.read_csv('train_solution.csv')

In [2]:
# Attach the label column from `df_sol` to the messages, matching rows on `id`.
# Guarded so that re-running this cell does not merge a second time (a repeated
# merge would produce `category_x` / `category_y` instead of `category`).
if 'category' not in df.columns:
    df = df.merge(df_sol, on='id')

In [3]:
# Preview the first rows of the merged training frame.
df.head()

Unnamed: 0,id,message,category
0,271828,Over $616 million in Bitcoin was electrocated ...,1
1,271829,Quiz: Thursday or friday?,0
2,271830,The Australian Revenue Authority will start co...,1
3,271831,Let's continue😉. I present to you my new review,2
4,271832,Here comes your future palette.,2


In [4]:
# Class balance check: number of non-null values per column for each category.
df.groupby('category').count()

Unnamed: 0_level_0,id,message
category,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1428,1428
1,1199,1199
2,1217,1217


In [5]:
# Import the corpus reader under an alias so the name `stopwords` refers only
# to the resulting word list and never shadows the nltk object.
from nltk.corpus import stopwords as nltk_stopwords

# English stop words; `lemmatize` drops every token found in this list.
stopwords = nltk_stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
from string import punctuation as _ascii_punctuation

# Characters treated as noise by `lemmatize`: ASCII punctuation plus a space.
# '$' is removed from the set so that dollar tokens survive the filtering.
punctuation = _ascii_punctuation.replace('$', '') + ' '
punctuation

'!"#%&\'()*+,-./:;<=>?@[\\]^_`{|}~ '

In [7]:
import re

# Runs of ASCII letters and '$'. The range is spelled `A-Za-z` on purpose:
# `A-z` would also match the six characters between 'Z' and 'a' in ASCII
# ([, \, ], ^, _ and `), letting punctuation leak into the tokens.
regex = re.compile("[A-Za-z$]+")

def words_only(text, regex=regex):
    """Split ``text`` into tokens made of ASCII letters and '$'.

    Args:
        text: The raw message. Expected to be a ``str``; anything the pattern
            cannot scan (``None``, ``NaN`` floats from pandas, bytes, ...)
            is treated as an empty message.
        regex: Compiled pattern used for tokenisation. Defaults to the
            module-level ``regex`` as it was when this function was defined.

    Returns:
        A list of matched tokens in order of appearance, or ``[]`` when
        ``text`` is not a string.
    """
    try:
        return regex.findall(text)
    except TypeError:
        # Non-string input (e.g. a missing value); only this case is
        # swallowed so that genuine programming errors still surface.
        return []

In [8]:
def lemmatize(text):
    """Lower-case a token list and drop punctuation and stop-word tokens.

    Despite the name, no lemmatisation is performed: tokens are only
    lower-cased and filtered.

    Args:
        text: An iterable of string tokens (as produced by ``words_only``).

    Returns:
        The surviving tokens joined by single spaces. If ``text`` is not an
        iterable of strings, a single space ``" "`` is returned.
    """
    try:
        lowered = (word.lower() for word in text)
        # `punctuation` is a str, so `in` is a substring test; `stopwords`
        # is the module-level stop-word collection.
        return " ".join(
            word for word in lowered
            if word not in punctuation and word not in stopwords
        )
    except (TypeError, AttributeError):
        # Only malformed input is tolerated. A missing `stopwords` or
        # `punctuation` (NameError on a fresh kernel) must fail loudly
        # instead of silently turning every message into " ".
        return " "

In [9]:
def clean_text(text):
    """Tokenise ``text`` with ``words_only`` and filter the tokens with ``lemmatize``.

    Returns the cleaned message as a single space-separated string.
    """
    tokens = words_only(text)
    return lemmatize(tokens)

In [10]:
from tqdm import tqdm

# Clean every training message in the current process. A process pool is
# deliberately not used: worker processes must be able to import the mapped
# function, which is not guaranteed for functions defined in a notebook
# (e.g. with the "spawn" start method), and the whole pass takes well under
# a second anyway.
lemmas = [clean_text(message) for message in tqdm(df['message'], total=len(df))]

100%|██████████| 3844/3844 [00:00<00:00, 7188.60it/s]


In [11]:
# Add the cleaned text as a `lemmas` column next to the raw messages.
df = df.assign(lemmas=lemmas)
df.head()

Unnamed: 0,id,message,category,lemmas
0,271828,Over $616 million in Bitcoin was electrocated ...,1,$ million bitcoin electrocated september wrapp...
1,271829,Quiz: Thursday or friday?,0,quiz thursday friday
2,271830,The Australian Revenue Authority will start co...,1,australian revenue authority start collecting ...
3,271831,Let's continue😉. I present to you my new review,2,let continue present new review
4,271832,Here comes your future palette.,2,comes future palette


In [12]:
# Keep only the label and the cleaned text. `errors='ignore'` makes the cell
# safe to re-run: once the columns are gone, a second run is a no-op instead
# of raising KeyError.
df = df.drop(columns=['id', 'message'], errors='ignore')

In [13]:
# Confirm that only `category` and `lemmas` remain.
df.head()

Unnamed: 0,category,lemmas
0,1,$ million bitcoin electrocated september wrapp...
1,0,quiz thursday friday
2,1,australian revenue authority start collecting ...
3,2,let continue present new review
4,2,comes future palette


In [14]:
import numpy as np
from sklearn.model_selection import train_test_split

# Features are the cleaned messages, targets are the category labels.
X = np.array(df.lemmas.tolist())
y = np.array(df.category.tolist())

# No hold-out split is made: the whole labelled set is used for training.
X_train, y_train = X, y

In [15]:
# Dump the training set with one example per line, each line being
# "__label__<category> <cleaned text>".
with open('train.txt', 'w+') as outfile:
    outfile.writelines(
        '__label__' + str(label) + ' ' + text + '\n'
        for text, label in zip(X_train, y_train)
    )

In [16]:
import fasttext

# Train the supervised fastText model on the file written in the previous cell.
classifier = fasttext.train_supervised(
    'train.txt',
    dim=300,
    epoch=25,
    wordNgrams=2,
    loss='ova',
)

In [17]:
# Load the unlabelled messages that need a predicted category.
df_test = pd.read_csv('test_data.csv')

In [18]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,id,message
0,275672,But a lot of people have a job fair tonight.
1,275673,"Also, I got only 4 answers on the google form ..."
2,275674,"Vladimir, when will we have seminar?"
3,275675,"A couple at 111, too?"
4,275676,"It's on Anti-buying. And again, Zara:"


In [19]:
from tqdm import tqdm

# Apply the same cleaning as for the training set, in the current process.
# A process pool is deliberately not used: worker processes must be able to
# import the mapped function, which is not guaranteed for functions defined
# in a notebook (e.g. with the "spawn" start method), and the whole pass
# takes well under a second anyway.
lemmas = [clean_text(message) for message in tqdm(df_test['message'], total=len(df_test))]

100%|██████████| 5927/5927 [00:00<00:00, 8625.33it/s] 


In [20]:
# Add the cleaned text as a `lemmas` column next to the raw test messages.
df_test = df_test.assign(lemmas=lemmas)
df_test.head()

Unnamed: 0,id,message,lemmas
0,275672,But a lot of people have a job fair tonight.,lot people job fair tonight
1,275673,"Also, I got only 4 answers on the google form ...",also got answers google form
2,275674,"Vladimir, when will we have seminar?",vladimir seminar
3,275675,"A couple at 111, too?",couple
4,275676,"It's on Anti-buying. And again, Zara:",anti buying zara


In [21]:
# Size of the training frame (rows, columns).
df.shape

(3844, 2)

In [22]:
# Size of the test frame (rows, columns).
df_test.shape

(5927, 3)

In [23]:
# Keep only the cleaned text. `errors='ignore'` makes the cell safe to
# re-run: once the columns are gone, a second run is a no-op instead of
# raising KeyError.
df_test = df_test.drop(columns=['id', 'message'], errors='ignore')

In [24]:
# Dump the cleaned test messages, one per line. The column is iterated
# directly instead of building a row Series with `.iloc[i]` for every line.
with open('test_final.txt', 'w+') as outfile:
    outfile.writelines(text + '\n' for text in df_test['lemmas'])

In [25]:
from tqdm import tqdm

# Labels were written to the training file with this prefix, and the model
# returns them with it; it is stripped to recover the bare category id.
LABEL_PREFIX = '__label__'

preds_final = []
# Iterate the column directly rather than materialising a row Series with
# `.iloc[i]` on every step.
for text in tqdm(df_test['lemmas'], total=len(df_test)):
    top_label = classifier.predict(text)[0][0]  # best-scoring label for this message
    preds_final.append(top_label[len(LABEL_PREFIX):])

100%|██████████| 5927/5927 [00:02<00:00, 2279.69it/s]


In [26]:
# Spot-check the first few predicted categories (stored as strings).
preds_final[:5]

['0', '0', '0', '0', '2']

In [27]:
# Load the submission template; its `category` column is filled in below.
submission = pd.read_csv('sample_submission.csv')

In [28]:
# Assign predictions positionally.
# NOTE(review): this assumes the template rows are in the same order as
# test_data.csv — confirm, since `id` is not used to align them.
submission['category'] = preds_final

In [29]:
# Write the filled-in submission without the index column.
# NOTE(review): this overwrites the same 'sample_submission.csv' file that was
# read as the template above — consider writing to a separate file instead.
submission.to_csv("sample_submission.csv", index=False)

это задание 2)))

In [35]:
# Take the token of the top-ranked neighbour returned for the query.
# NOTE(review): the query is mixed-case and contains a space, whereas the
# model was trained on lower-cased single tokens; presumably this call
# expects one word, which may explain the uninformative '$' result —
# confirm against the fastText documentation.
classifier.get_nearest_neighbors('My future')[0][1]

'$'