In [30]:
import nltk
from nltk.stem import WordNetLemmatizer
import re
import string
import pandas as pd

df = pd.read_csv("data/raw/train.csv")
i = nltk.corpus.stopwords.words('english')
j = list(string.punctuation)

stopwords = set(i).union(j)

wordnet_lemmatizer = WordNetLemmatizer()

def preprocess(row):
    new_row = []
    row = re.sub('[^a-z\s]', '', row.lower())                  # get rid of noise
    row = [w for w in row.split() if w not in set(stopwords)]  # remove stopwords
    for word in row:
        new_word = wordnet_lemmatizer.lemmatize(word)
        new_row.append(new_word)
    return " ".join(new_row)

df = df[pd.isnull(df.text) == False] # drop rows with no text
df["cleaned_text"] = df["text"].apply(preprocess)
df = df[["cleaned_text", "label"]]

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer

# df.cleaned_text = df.cleaned_text.str.join(sep = " ")
cv = CountVectorizer()
tf_df = cv.fit_transform(df.cleaned_text)

X_train, X_test, y_train, y_test = train_test_split(tf_df, df.label)
gauss = GaussianNB()
gauss.fit(X_train.toarray(), y_train)

In [29]:
len(cv.vocabulary_)

1988

In [32]:
gauss.score(X_test.toarray(), y_test)

0.8272009246773261

In [33]:
from sklearn.metrics import confusion_matrix
y_pred = gauss.predict(X_test.toarray())
confusion_matrix(y_test, y_pred)

array([[2410,  203],
       [ 694, 1884]])

### Test area

In [2]:
tdf = df
tdf["cleaned_text"] = tdf["cleaned_text"].apply(lambda row: row.split(" ")) # only if it wasn't split to list of strings
tdf = tdf.explode("cleaned_text")
tdf = tdf.groupby(by=["label", "cleaned_text"])["cleaned_text"].count()
tdf = pd.DataFrame(tdf)
tdf = tdf.rename(columns = {"cleaned_text" : "word_count"})
tdf = tdf.reset_index(names = ["label", "cleaned_text"])

In [3]:
tdf

Unnamed: 0,label,cleaned_text,word_count
0,0,,1
1,0,aa,13
2,0,aaa,16
3,0,aaaaah,1
4,0,aaaahhh,1
...,...,...,...
209280,1,zygar,3
209281,1,zylinderkopfdichtung,1
209282,1,zymon,1
209283,1,zytsov,1


In [4]:
df

Unnamed: 0,cleaned_text,label
0,"[house, dem, aide, didnt, even, see, comeys, l...",1
1,"[ever, get, feeling, life, circle, roundabout,...",0
2,"[truth, might, get, fired, october, tension, i...",1
3,"[video, civilian, killed, single, u, airstrike...",1
4,"[print, iranian, woman, sentenced, six, year, ...",1
...,...,...
20795,"[rapper, unloaded, black, celebrity, met, dona...",0
20796,"[green, bay, packer, lost, washington, redskin...",0
20797,"[macys, today, grew, union, several, great, na...",0
20798,"[nato, russia, hold, parallel, exercise, balka...",1


In [None]:
df.cleaned_text = df.cleaned_text.str.join(sep = " ")
cv = CountVectorizer()
tf_df = cv.fit_transform(df.cleaned_text)

gauss = GaussianNB()
gauss.fit(tf_df.toarray(), df.label)

In [35]:
from sklearn.pipeline import make_pipeline



model = make_pipeline([CountVectorizer(), GaussianNB()])
model.fit(df["cleaned_text"], df["label"])

TypeError: Last step of Pipeline should implement fit or be the string 'passthrough'. '[CountVectorizer(), GaussianNB()]' (type <class 'list'>) doesn't

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix()

In [None]:
model.score()