In [1]:
import pandas as pd
from datetime import datetime
import nltk
import ast
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from nltk.corpus import stopwords

In [2]:
def preprocess(text):
    text = text.strip()
    result = ""
    punctless = re.sub('[^a-zA-Z]', ' ', text)
    lowercase = punctless.lower()
    words_list = lowercase.split()
    words_list =  [word for word in words_list if not word in set(stopwords.words('english'))]
    for word in words_list:
        result += " " + word.strip()
    return result

# Short samples preproc

In [6]:
raw_short = pd.read_csv("data/raw/spam-text-message.csv")

In [7]:
raw_short.insert( 2, "text", "Empty")
raw_short.insert( 3, "label", "Empty")

In [8]:
for ind in raw_short.index:
    if raw_short["text"][ind] == "Empty":
        message = raw_short["Message"][ind]
        raw_short["text"][ind] = preprocess(message)
    
    if raw_short["label"][ind] == "Empty":
        if raw_short["Category"][ind] == "ham":
            raw_short["label"][ind] = 0
        elif raw_short["Category"][ind] == "spam":
            raw_short["label"][ind] = 1

In [9]:
processed_short = raw_short[["text","label"]]
processed_short.dropna()

Unnamed: 0,text,label
0,go jurong point crazy available bugis n great...,0
1,ok lar joking wif u oni,0
2,free entry wkly comp win fa cup final tkts st...,1
3,u dun say early hor u c already say,0
4,nah think goes usf lives around though,0
...,...,...
5567,nd time tried contact u u pound prize claim e...,1
5568,b going esplanade fr home,0
5569,pity mood suggestions,0
5570,guy bitching acted like interested buying som...,0


In [3]:
df = pd.read_csv("data/processed/processed_short.csv")
processed_short = df.dropna().reset_index(drop=True)
processed_short['text'].isna().sum()

0

In [4]:
processed_short.to_csv("data/processed/processed_short.csv", index=False)

# Medium samples preproc

In [3]:
raw_medium = pd.read_csv("data/raw/IMDB Dataset.csv")

In [4]:
raw_medium.insert( 2, "text", "Empty")
raw_medium.insert( 3, "label", "Empty")

In [14]:
for ind in raw_medium.index:
    if raw_medium["text"][ind] == "Empty":
        review = raw_medium["review"][ind]
        raw_medium["text"][ind] = preprocess(review)
    
    if raw_medium["label"][ind] == "Empty":
        if raw_medium["sentiment"][ind] == "negative":
            raw_medium["label"][ind] = 0
        elif raw_medium["sentiment"][ind] == "positive":
            raw_medium["label"][ind] = 1

In [15]:
processed_medium = raw_medium[["text","label"]]
processed_medium = processed_medium.dropna()
processed_medium.to_csv("data/raw/full_proc_imdb.csv", index=False)

Unnamed: 0,text,label
0,one reviewers mentioned watching oz episode h...,1
1,wonderful little production br br filming tec...,1
2,thought wonderful way spend time hot summer w...,1
3,basically family little boy jake thinks zombi...,0
4,petter mattei love time money visually stunni...,1
...,...,...
49995,thought movie right good job creative origina...,1
49996,bad plot bad dialogue bad acting idiotic dire...,0
49997,catholic taught parochial elementary schools ...,0
49998,going disagree previous comment side maltin o...,0


In [8]:
processed_medium = pd.read_csv("data/processed/processed_medium.csv")

In [6]:
df = processed_medium

NameError: name 'processed_medium' is not defined

In [17]:
processed_medium_1 = df.loc[df["label"]==1].sample(n=2500, random_state=42)
processed_medium_0 = df.loc[df["label"]==0].sample(n=2500, random_state=42)

processed_medium = processed_medium_1.append(processed_medium_0, ignore_index=True).reset_index(drop=True)
processed_medium


Unnamed: 0,text,label
0,know film meager rating imdb film accompanied...,1
1,long time seemed like good canadian actors he...,1
2,terry gilliam david peoples teamed create one...,1
3,say anti establishment film produced time col...,1
4,movie made years end civil war likely anticip...,1
...,...,...
4995,scientist girl friend driving speeding causes...,0
4996,setup nature beast ingeniously simple fraught...,0
4997,minor spoilers bad movie connoisseur must vie...,0
4998,evan almighty steve carell morgan freeman lau...,0


In [9]:
processed_medium['text'].isna().sum()

0

In [18]:
processed_medium.to_csv("data/processed/processed_medium.csv", index=False)

# DANK samples preproc

In [12]:
df = pd.read_csv("data/processed/processed_dank.csv")
df['text'].isna().sum()

0

# LEGACY - Long samples preproc

## For BBC articles

In [8]:
from pathlib import Path
rows_list = []
#ranges: 512 for sport, 402 for tech, 418 for politics, 511 for business
for i in range(1,418):
    txt = Path("data/raw/bbc/politics/"+str(i).zfill(3)+'.txt').read_text()
    dict1 = {}
    dict1.update({'text':preprocess(txt)})
    dict1.update({'label': 1})
    rows_list.append(dict1)

for j in range(1,511):
    txt = Path("data/raw/bbc/business/"+str(j).zfill(3)+'.txt').read_text()
    dict2 = {}
    dict2.update({'text':preprocess(txt)})
    dict2.update({'label':0})
    rows_list.append(dict2)

processed_long = pd.DataFrame(rows_list, columns=["text", "label"])

In [9]:
#df = pd.read_csv("data/processed/processed_long.csv")
processed_long=processed_long.dropna()

In [10]:
processed_long.to_csv("data/processed/processed_long_pol_v_busi.csv", index=False)

## For 20News

In [58]:
raw_20news = pd.read_csv("data/raw/20newsgroup_preprocessed.csv", delimiter=";")
raw_20news = raw_20news.drop(columns="text")

In [59]:

raw_20news = raw_20news.rename(columns={'text_cleaned': "text", "target": "label"}, inplace=False)

In [60]:
raw_20news = raw_20news[['text', 'label']]

In [61]:
raw_20news = raw_20news.dropna()

In [63]:
processed_long['label'] = raw_20news['label'].apply(labels.index)
processed_long

Unnamed: 0,text,label
0,atheist resources addresses atheist organizati...,0
1,begin pgp signed message introduction atheism ...,0
2,article charley wingate writes well john quite...,0
3,kings become philosophers philosophers become ...,0
4,article bob mcgwier writes however hate econom...,0
...,...,...
18823,paul boxrud writes wasnt sure right newsgroup ...,19
18824,article joakim ruud writes article mark wilson...,19
18825,article bill jefferys writes would like unders...,19
18826,article gerry palo danger anticult groups expo...,19


In [64]:
processed_long.to_csv("data/processed/processed_long.csv", index=False)

In [62]:
labels = raw_20news["label"].unique().tolist()
cheatcode = {}
for i in labels:
    cheatcode.update({i :labels.index(i)})
print(cheatcode)


{'alt.atheism': 0, 'comp.graphics': 1, 'comp.os.ms-windows.misc': 2, 'comp.sys.ibm.pc.hardware': 3, 'comp.sys.mac.hardware': 4, 'comp.windows.x': 5, 'misc.forsale': 6, 'rec.autos': 7, 'rec.motorcycles': 8, 'rec.sport.baseball': 9, 'rec.sport.hockey': 10, 'sci.crypt': 11, 'sci.electronics': 12, 'sci.med': 13, 'sci.space': 14, 'soc.religion.christian': 15, 'talk.politics.guns': 16, 'talk.politics.mideast': 17, 'talk.politics.misc': 18, 'talk.religion.misc': 19}


In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import average_precision_score, f1_score, roc_auc_score
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

def model_trial_sampler(df, i, j):
    name =[x for x in globals() if globals()[x] is df][0]
    #creating the desired vectors
    text_vec = TfidfVectorizer().fit_transform(df["text"])
    y = df["label"]
    X_train, X_test, y_train, y_test = train_test_split(
    text_vec, y, test_size=0.33, random_state=42)
    #svm
    svm = LinearSVC()
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    trials.update({"(" + str(i) + ' , ' + str(j) + ")": roc_auc_score(y_test,y_pred)})

In [70]:
df_long = pd.read_csv("data/processed/processed_long.csv")
trials = {}
for i in range(20):
    for j in range(20):
        if i != j:
            sample = df_long.loc[df_long['label'].isin([i, j])]
            sample['label'] = sample['label'].map({i: 1, j: 0}).astype(int)
            sample = sample.reset_index(drop=True)
            model_trial_sampler(sample, i, j)

//pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['label'] = sample['label'].map({i: 1, j: 0}).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['label'] = sample['label'].map({i: 1, j: 0}).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['label'] = sample['label'].map({i: 1, j: 0}).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pyda

In [71]:
min(trials, key=trials.get) 

'(3 , 2)'