In [231]:
##set up the environment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfTransformer

#import regex
import re

In [232]:
#load Data
labeledData = pd.read_csv('../CrowdFlowerDataset/labeled_data.csv')
labeledData.head()
sarcasmData = pd.read_csv('../SarcasmDataset/sarcasm_v2.csv')
sarcasmData.head()
taggedData = pd.read_csv('tagged_data.csv')

In [233]:
labeledData["hs_score"] = labeledData["hate_speech"]/labeledData["count"]
labeledData["ol_score"] = labeledData["offensive_language"]/labeledData["count"]
labeledData["n_score"] = labeledData["neither"]/labeledData["count"]
sarcasmData["class"] = 3

In [234]:
labeledData[labeledData["hate_speech"]>0].head()
taggedData.shape

(896, 5)

In [235]:
#concat datasets
X_data = pd.concat([labeledData["tweet"],sarcasmData["Response Text"],taggedData['tweet_content']])
y_data = pd.concat([labeledData["class"],sarcasmData["class"],taggedData['class']])
X_data.size

30371

In [236]:
#preprocessing    
X_data = X_data.str.lower()  #Convert to lower case
X_data = X_data.apply((lambda x: re.sub('[!]+ rt ',' RT ', x))) 
X_data = X_data.apply((lambda x: re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',x)))     #Convert www.* or https?://* to URL
X_data = X_data.apply((lambda x: re.sub('@[^\s]+','AT_USER',x)))      #Convert @username to AT_USER
X_data = X_data.apply((lambda x: re.sub('[\s]+', ' ', x)))          #Remove additional white spaces
X_data = X_data.apply((lambda x: re.sub(r'#([^\s]+)', r'\1', x)))   #Replace #word with word
X_data = X_data.str.strip()


In [237]:
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
train_tfidf.shape
y_data.shape

(30371,)

In [238]:
#train a SGD classifier
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_data)
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X = train_tfidf
y = y_data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

estimator = SGDClassifier(loss="hinge", penalty="l2")
estimator.fit(X_train, y_train)
prediction = estimator.predict(X_test)
accuracy_score(y_test,prediction)



0.90491241933359678

In [239]:
#completely fit
estimator.fit(X,y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [240]:
import pickle
pickle.dump(estimator, open("model/sgdClassifier.p","wb"))
pickle.dump(count_vect, open("model/hurtVectorizer.p","wb"))
pickle.dump(tfidf_transformer, open("model/hurtVectorizerTfIdf.p","wb"))


In [123]:

X_train_counts = count_vect.transform(["hola"])
train_tfidf = tfidf_transformer.transform(X_train_counts)
train_tfidf
estimator.predict(train_tfidf)


array([2])

In [249]:
for x in ["hola,como,estas".split(',')]:
    print(x.__class__)

<class 'list'>


In [266]:
def inplace_change(filename, old_string, new_string):
    # Safely read the input filename using 'with'
    with open(filename) as f:
        s = f.read()
        if old_string not in s:
            return

    # Safely write the changed content, if found in the file
    with open(filename, 'w') as f:
        s = s.replace(old_string, new_string)
        f.write(s)


In [267]:
from shutil import copyfile
template = "/home/alejandra/Documents/ELK_Twitter/ELK_twitter/src/twitter-pipeline/config/user_twitter_pipeline_template.conf"
newFile = "/home/alejandra/Documents/ELK_Twitter/ELK_twitter/src/twitter-pipeline/config/user_twitter_pipeline1.conf"
copyfile(template,newFile)
inplace_change(newFile,"<user_list_to_insert>","\"" + "\",\"".join(['hola','como','estas']) + "\"")

In [269]:
a = ["0","1","2","3","4","5","6"]
a[0:4]

['0', '1', '2', '3']