# Code

In [23]:
import csv
import re
import random
import string
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# define functions
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

# remove url's, hashtags, and @
def cleaner(corpus):
    for i in range(len(corpus)):
        # corpus[i] = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', corpus[i])
        corpus[i] = re.sub(r'http\S+', '', corpus[i])
        corpus[i] = corpus[i].replace('@', '')
        corpus[i] = corpus[i].replace('#', '')
        corpus[i] = corpus[i].replace('RT', '')
        corpus[i] = corpus[i].replace('amp', '')
    return corpus

Warning, explicit tweets below....

In [26]:
# read in data

with open('/home/b/Documents/humanRights/human_rights_training_sample_8-18-15.csv') as csvfile: 
    reader = csv.reader(csvfile)
    next(reader, None) # skip headers
    hr = [row[1] for row in reader]

hr = cleaner(hr)
for i in range(10):
    print(random.choice(hr))

with open('/home/b/Documents/humanRights/non_hr_training_sample_8-21-15.csv') as csvfile:
    reader = csv.reader(csvfile)
    non_hr = [row[11] for row in reader]

non_hr = cleaner(non_hr)
for i in range(10):
    print(random.choice(non_hr))

US: Lockett’s suffering during execution on 29/4 may amount to cruel, inhuman &; degrading treatment: 
Argentina the 17th State to ratify OP3CRC &; give childen accesstojustice! UN Experts &; NGOs call on gvts to follow 
AlbertWoodfox is still behind bars. Tell BobbyJindal: Free Albert NOW!  stopsolitary 
RanaPlaza compensation fund pays first installments to 1,587 beneficiaries.  via cleanclothes Bangladesh
 MontseCarboni: Ghana states that immunity for Heads of States may be a potential risk for peace and stability ASP12 icc_cpi fidh_en
The True Cost of Industrialized Food:  foodsecurity
.openDemocracy: Indigenous activists use int'l humanrights to advance climatejustice  EarthDay PFII14 UNPFII
A political consultant said oil &; gas officials need to exploit emotions and turn them against environmental groups

Today UNrightswire celebrates World Hospice Day. We care, do you?  WHPCday 
Lunchtime reads via HuffPostWorld: What Has Gone Right in Afghanistan:

Needa go home but I'm tooo l

In [24]:
# transform data

all_tweets = hr + non_hr
y = np.concatenate([np.ones(len(hr)), np.zeros(len(non_hr))])

c_vectorizer = TfidfVectorizer(tokenizer = tokenize, stop_words = 'english', strip_accents = 'unicode')

counts = c_vectorizer.fit_transform(all_tweets)

In [30]:
# chi-square feature selection

chi2score = chi2(counts, y)[0]
wscores = zip(c_vectorizer.get_feature_names(), chi2score)
wchi2 = sorted(wscores, key = lambda x:x[1])
top_wchi2 = wchi2[-1000:]
words = [x[0] for x in top_wchi2]
print(top_wchi2[-20:])

[('support', 296.7305649331779), ('shit', 303.46970219506045), ('na', 309.91068568228184), ('justic', 313.42718268508224), ('protect', 334.89097515308072), ('got', 354.54129493469424), ('report', 359.52831633188515), ('fuck', 368.1405942889794), ('job', 369.23647638332687), ('lol', 376.62896692998879), ('just', 404.48669713049935), ('women', 408.60484362918498), ('dont', 416.67538889851954), ('love', 429.85859617526933), ('tortur', 437.02168682624398), ('like', 444.20975216174133), ('right', 605.90052928594571), ('humanright', 632.17347159587348), ('human', 658.6534018392756), ('im', 874.84224077504177)]


In [33]:
# training
u_vectorizer = TfidfVectorizer(tokenizer = tokenize, stop_words = 'english', strip_accents = 'unicode', vocabulary = words)
counts_new = u_vectorizer.fit_transform(all_tweets)
X_train, X_test, y_train, y_test = train_test_split(counts_new, y, test_size = 0.5)
logit = LogisticRegression()
logit.fit(X_train, y_train)
predictions = logit.predict(X_test)
acc_test = sum(y_test == predictions)/len(y_test)
print('Accuracy:', acc_test)

Accuracy: 0.931496077392


Note that the classifier performs well within sample

In [34]:
# labeling unclassified data

with open('/home/b/Documents/humanRights/first200k.csv') as csvfile: 
    reader = csv.reader(csvfile)
    next(reader, None) # skip headers
    unclass = [row[11] for row in reader]

u_vectorizer = CountVectorizer(tokenizer = tokenize, stop_words = 'english', strip_accents = 'unicode', vocabulary = words)
X_un = u_vectorizer.fit_transform(unclass)

un_pred = logit.predict(X_un)
un_prob = logit.predict_proba(X_un)

un_tweets = zip(unclass, un_prob)
un_tweets = sorted(un_tweets, key = lambda x:x[1][0])

# Results

The below is the 100 highest scoring tweets. If we can get rid of Spanish tweets, then it does seem to perform fairly well. Much better than our previous machine learning attempts.

In [41]:
tweets_only = [x[0] for x in un_tweets]
print('\n\n'.join(tweets_only[:100]))

No pueden fallar en lo basico😡 lester bolon, en rally, la fanaticada metia en el juego y te sorprenden en pickoff? -.-

Me escape de la rutina, para pilotear mi viaje Por que el cubo en el que vivía se convirtió en paisaje

La renta, el sueldo, el trabajo en la oficina Lo cambie por las estrellas y por huertos de harina

DEJEN DE CREER EN LOS QUE SE DISFRAZAN DE OPOSICIÒN PERO NO LUCHAN CONTRA EL REGIMEN. SOLO LE HACEN EL JUEGO Y GANAN TIEMPO PARA ELECCIONES.

Happy to see oil dropping. Eat that Putin. All of our enemies are struggling due to low oil prices. Venezuela, Iran isis. #oil #stocks

Aja y @RevistaSemana q es como Dios q esta en todas partes y todo lo sabe (lo que le conviene al TIO) y de esto que? http://t.co/K5hC4nDBHS

Editando la historia del primer caso diagnosticado de #Ebola en los #USA. En la ciudad de #Dallas. http://t.co/wvYyPv36YU

mi madre haya esta en el rancho disculpe si no le dicho que estoy muy agradecido por todo lo que me a dado

Dios le tiene su destino a 