In [2]:
# import packages
import pickle
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords

### Load and Preprocess Data

In [3]:
# path of the CSV file whose comments are to be annotated
file = 'Felixje0.csv'
# load the file into a DataFrame and show its (rows, columns) shape
df = pd.read_csv(file)
df.shape

(3908, 3)

In [4]:
# preview the first rows of the loaded frame (rendered as the cell output)
df.head()

Unnamed: 0,Label,Username,Comment
0,,Auktarrh Engrinis,6:58 It only arrive once the galaxy and the st...
1,,Rob Byer,"Lowko, you forgot about the greatest player, t..."
2,,Nátán Kende,A game of monsters.
3,,Jakub Sobkowiak,Best StarCraft II player of all time?\nOf cour...
4,,Philip Lau,Bruh.... The SC2 GOAT is obviously Idra.\n\n\n...


In [5]:
# flag rows where either the username or the comment is missing
missing_mask = df['Username'].isnull() | df['Comment'].isnull()

# collect the labels of those rows and drop them from the frame
index = df.loc[missing_mask].index
df = df.drop(index=index)

In [7]:
# preview the first rows after incomplete rows were dropped
print(df.head())

# name of the text column that gets cleaned and classified
c = 'Comment'

# work on a copy so `df` keeps the original comment text
data = df.copy()

# Build the stop-word lookup through `nltk.corpus` rather than through the
# imported `stopwords` name: this cell rebinds `stopwords`, so calling
# `stopwords.words(...)` on it would raise AttributeError when the cell is
# run a second time. A set also makes the per-token membership test cheap.
stopwords = set(nltk.corpus.stopwords.words('english'))

# stemmer used to reduce tokens to their stems
stemmer = PorterStemmer()

   Label           Username                                            Comment
0    NaN  Auktarrh Engrinis  6:58 It only arrive once the galaxy and the st...
1    NaN           Rob Byer  Lowko, you forgot about the greatest player, t...
2    NaN        Nátán Kende                                A game of monsters.
3    NaN    Jakub Sobkowiak  Best StarCraft II player of all time?\nOf cour...
4    NaN         Philip Lau  Bruh.... The SC2 GOAT is obviously Idra.\n\n\n...


In [8]:
# pre-compiled patterns for the clean-up steps applied to each comment
non_word = re.compile(r'\W')
whitespace_run = re.compile(r'\s+')
lone_letter = re.compile(r'\s+[a-zA-Z]\s+')


def clean_comment(text):
    """Normalise one raw comment into a space-joined string of stemmed tokens.

    Steps, in order: replace special characters with spaces, collapse
    whitespace, remove single letters surrounded by whitespace, convert to
    lower case, tokenize on whitespace, drop stop words and stem the rest.
    """
    # remove special characters
    text = non_word.sub(' ', text)
    # replace multiple spaces with a single space
    text = whitespace_run.sub(' ', text)
    # remove single characters standing between whitespace
    text = lone_letter.sub(' ', text)
    # convert text to lower case and tokenize
    tokens = text.lower().split()
    # apply stemming to every token that is not a stop word
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stopwords)


# Always start from the untouched text in `df` so that re-running this cell
# gives the same result instead of cleaning already-stemmed text again.
data[c] = df[c].map(clean_comment)

### Load Model and Make Predictions 

In [9]:
# path of the pickled bundle that stores the classifier and the tfidf object
model = 'svm.pkl'

# NOTE: unpickling can execute arbitrary code - only load a file you trust.
# The context manager closes the file handle as soon as the bundle is read.
with open(model, 'rb') as model_file:
    model_data = pickle.load(model_file)

# the bundle is a mapping; pull out the two objects used below
classifier = model_data['clf']
tfidf = model_data['tfidf']

In [10]:
# transform the cleaned comments with the loaded tfidf object ...
comment_features = tfidf.transform(data['Comment'])

# ... and convert the result to a dense array for the classifier
X = comment_features.toarray()

In [11]:
# run the classifier on the feature matrix
labels = classifier.predict(X)

# store the predictions in the 'Label' column of the original frame
df = df.assign(Label=labels)

In [14]:
# output path derived from the input file name
output_file = file.replace('.csv', '_Labelled') + '.csv'

# keep only the rows labelled 1 (request comments) and save them without the index
request_rows = df[df['Label'] == 1]
request_rows.to_csv(output_file, index=False)