<h2> First Approach: TF/IDF </h2>

In [1]:
# --- Preprocessing ---
import pandas as pd
from sklearn.utils import resample

# --- Processing ---
import re
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

# --- Postprocessing ---
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [3]:
# Reading the files
train = pd.read_csv('Datasets/0train_tweets.csv',usecols=['tweet','label'])
test = pd.read_csv('Datasets/0test_tweets.csv')
pd.set_option('display.max_colwidth', -1)

print('Column names: %s'%(train.columns))
print('Train length: %s'%(len(train)))
print('Test length: %s '%(len(test)))

Column names: Index(['label', 'tweet'], dtype='object')
Train length: 31962
Test length: 17197 


In [4]:
train.tail()

Unnamed: 0,label,tweet
31957,0,ate @user isz that youuu?ðððððððððâ¤ï¸
31958,0,to see nina turner on the airwaves trying to wrap herself in the mantle of a genuine hero like shirley chisolm. #shame #imwithher
31959,0,listening to sad songs on a monday morning otw to work is sad
31960,1,"@user #sikh #temple vandalised in in #calgary, #wso condemns act"
31961,0,thank you @user for you follow


In [5]:
# Tweets cleaning
def clean_text(df,text_field):
    df[text_field] = df[text_field].str.lower()
    mystring = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
    # the first filter is capture everything eclosed that starts with an @
    # and that has any single character from the range A-Z or a-z or 0-9 with more
    # than one element inside of that
    # the second filter is capture everything that is not a number, a lower or upper
    # case letter after the last line
    df[text_field] = df[text_field].apply(lambda element: re.sub(mystring,"",element))
    return df

train_clean = clean_text(train,'tweet')
test_clean = clean_text(test,'tweet')
train_clean.tail()

Unnamed: 0,label,tweet
31957,0,ate isz that youuu
31958,0,to see nina turner on the airwaves trying to wrap herself in the mantle of a genuine hero like shirley chisolm shame imwithher
31959,0,listening to sad songs on a monday morning otw to work is sad
31960,1,sikh temple vandalised in in calgary wso condemns act
31961,0,thank you for you follow


In [6]:
print('Train length: %s'%(len(train)))
print('Test length: %s '%(len(test)))

Train length: 31962
Test length: 17197 


<h3>Upsampling</h3>

In [7]:
# Number of non-racist/non-sexist (0's) tweets
# in the training
train_majority = train_clean[train_clean.label==0]
print('Number of non-racist/non-sexist tweets: %s '%(len(train_majority)))

Number of non-racist/non-sexist tweets: 29720 


In [8]:
# Number of racist/sexist (1's) tweets in the training
train_minority = train_clean[train_clean.label==1]
print('Number of racist/sexist tweets: %s '%(len(train_minority)))

Number of racist/sexist tweets: 2242 


In [9]:
# Upsampling:
# We use the tags with less frequency, in this case 1's, inside
# the training in order to create an upsampled training set with
# the same dimension the tags with the highest frequency have (in
# this case 0's), allowing tweet repetition.
train_minority_upsampled = resample(train_minority, # data  set to use
                                    replace=True, # repetitions are allowed
                                    n_samples=len(train_majority), # I want 29720 samples
                                    random_state=123) # using this random state

# Now that the minority of tweets is no longer outmatched by 
# the majority, we can put them together, obtaining a training
# set of twice the size of the majority, i,.e. 2*(2970) = 59440
train_upsampled = pd.concat([train_minority_upsampled, train_majority])
print('Upsampling')
print('Number of total tweets in training: %s,%s '%((train_upsampled.shape)))
print(train_upsampled['label'].value_counts())

Upsampling
Number of total tweets in training: 59440,2 
1    29720
0    29720
Name: label, dtype: int64


<h3>Downsampling</h3>

In [10]:
# Number of non-racist/non-sexist (0's) tweets
# in the training
train_majority = train_clean[train_clean.label==0]
print('Number of non-racist/non-sexist tweets: %s '%(len(train_majority)))

Number of non-racist/non-sexist tweets: 29720 


In [11]:
# Number of racist/sexist (1's) tweets in the training
train_minority = train_clean[train_clean.label==1]
print('Number of non-racist/non-sexist tweets: %s '%(len(train_minority)))

Number of non-racist/non-sexist tweets: 2242 


In [12]:
# Downsampling:
# We use the tags with the largest frequency, in this case the 
# 0's inside the training in order to create an downsampled training 
# set with the same dimension the tags with the lowest frequency have (in
# this case 1's), allowing tweet repetition.
train_majority_downsampled = resample(train_majority, # use the train majority
                                 replace=True,  # repetitions are allowed
                                 n_samples=len(train_minority), # I want 2242 samples
                                 random_state=123) #using this random state

# Now that the majority of tweets has been reduced to a set
# with the same size as the minority, we can put them together,
# obtaining a training set of twice the size of the minority, i,.e. 2*(2242) = 4484
train_downsampled = pd.concat([train_majority_downsampled, train_minority])
print('Downsampling')
print('Number of total tweets in training: %s,%s '%((train_downsampled.shape)))
print(train_downsampled['label'].value_counts())

Downsampling
Number of total tweets in training: 4484,2 
1    2242
0    2242
Name: label, dtype: int64


<h3> Model Training </h3>

In [13]:
# Training and test sets
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['tweet'],
                                                    train_upsampled['label'],
                                                    random_state=0)

# Format: Training
#
# 1. Vectorizer 
# Here we get the list of words of the whole 
# training set, the whole tokenized version 
# of the dataset.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X_train)

# 2. TF/IDF
# It indicates how many times a word appears
# inside each tweet.
tfidf_transformer = TfidfTransformer()#use_idf=False).fit(X)
X_train_tfidf = tfidf_transformer.fit_transform(X)

# The Model
clf = SGDClassifier()
model = SGDClassifier().fit(X_train_tfidf,y_train)

# Format: Testing
# 
# 1. Vectorizer 
X_prime = vectorizer.transform(X_test)
# 2. TF/IDF
X_test_tfidf = tfidf_transformer.transform(X_prime)

# Prediction
y_predict = model.predict(X_test_tfidf)
f1_score(y_test, y_predict)



0.9694666666666667

In [14]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_predict))#,target_names=twenty_test.target_names))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      7490
           1       0.95      0.99      0.97      7370

   micro avg       0.97      0.97      0.97     14860
   macro avg       0.97      0.97      0.97     14860
weighted avg       0.97      0.97      0.97     14860



<h2> Pipeline </h2>

In [None]:
# Pipelines:
pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('nb', SGDClassifier()),
])

<h2> Prediction (Upsampling) </h2>

In [None]:
# Test and Training Splitting
X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(train_upsampled['tweet'],
                                                    train_upsampled['label'],
                                                    random_state=0)
# Model
model_up = pipeline_sgd.fit(X_train_up, y_train_up)
model_up

y_predict_up = model_up.predict(X_test_up)
y_predict_up
print('Accuracy with upsampling: %s '%(f1_score(y_test_up, y_predict_up)))

<h2> Prediction (Downsampling) </h2>

In [None]:
# Test and Training Splitting
X_train_down, X_test_down, y_train_down, y_test_down = train_test_split(train_downsampled['tweet'],
                                                    train_downsampled['label'],
                                                    random_state=0)
# Model
model_down = pipeline_sgd.fit(X_train_down, y_train_down)
model_down

y_predict_down = model_up.predict(X_test_down)
y_predict_down
print('Accuracy with downsampling: %s '%(f1_score(y_test_down, y_predict_down)))

<hr />

In [None]:
# # scikit-learn bootstrap
# from sklearn.utils import resample
# # data sample
# data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
# print('data: %s'%(data))
# # prepare bootstrap sample
# boot = resample(data, replace=True, n_samples=2*len(data), random_state=1)
# print('Bootstrap Sample: %s' % boot)
# # out of bag observations
# oob = [x for x in data if x not in boot]
# # print('OOB Sample: %s' % oob)

# # data: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
# # Bootstrap Sample: [0.6, 0.4, 0.5, 0.1, 0.2, 0.4, 0.6, 0.1, 0.1, 0.2, 0.5, 0.6]
# # OOB Sample: [0.3]

In [None]:
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
print(corpus)

In [None]:
myvectorizer = CountVectorizer()
myX = myvectorizer.fit_transform(corpus)
print(myvectorizer.get_feature_names())

In [None]:
print(myX)

In [None]:
# ['This is the first document.', 'This document is the second document.', 'And this is the third one.', 'Is this the first document?']
# ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
# [[0 1 1 1 0 0 1 0 1]
#  [0 2 0 1 0 1 1 0 1]
#  [1 0 0 1 1 0 1 1 1]
#  [0 1 1 1 0 0 1 0 1]]

mytf_transformer = TfidfTransformer(use_idf=False).fit(myX)
print(mytf_transformer)

In [None]:
myX_train_tf = mytf_transformer.transform(myX)
print(myX_train_tf)

In [None]:
myX_train_tf.toarray()
# TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False,
#          use_idf=False)
#   (0, 1)	0.4472135954999579
#   (0, 2)	0.4472135954999579
#   (0, 3)	0.4472135954999579
#   (0, 6)	0.4472135954999579
#   (0, 8)	0.4472135954999579
#   (1, 1)	0.7071067811865475
#   (1, 3)	0.35355339059327373
#   (1, 5)	0.35355339059327373
#   (1, 6)	0.35355339059327373
#   (1, 8)	0.35355339059327373
#   (2, 0)	0.4082482904638631
#   (2, 3)	0.4082482904638631
#   (2, 4)	0.4082482904638631
#   (2, 6)	0.4082482904638631
#   (2, 7)	0.4082482904638631
#   (2, 8)	0.4082482904638631
#   (3, 1)	0.4472135954999579
#   (3, 2)	0.4472135954999579
#   (3, 3)	0.4472135954999579
#   (3, 6)	0.4472135954999579
#   (3, 8)	0.4472135954999579
# (4, 9)

# myclf = SGDClassifier()#loss='hinge', penalty='l2',alpha=1e-3, random_state=42,max_iter=5, tol=None)