In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)

train = pd.read_csv("../data/kaggle_train.csv")
test = pd.read_csv("../data/kaggle_test.csv")

print("Stats of training set: ", train.shape)
print("Stats of testing set:", test.shape)


# print("Data labels: ", labels)

print("Split the dataset up 70/30:")

X = train[:int(train.shape[0] * 0.7)]
y = train[int(train.shape[0] * 0.7)+1:]
print("Stats of X set: ", X.shape)
print("Stats of y set:", y.shape)


Stats of training set:  (159571, 8)
Stats of testing set: (153164, 2)
Split the dataset up 70/30:
Stats of X set:  (111699, 8)
Stats of y set: (47871, 8)


In [3]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\r\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport """,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0


In [3]:
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \r\n\r\n The title is fine as i...
2,00013b17ad220c46,""" \r\n\r\n == Sources == \r\n\r\n * Zawe Ashto..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [4]:
# Check for missing values
train.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [5]:
# Check for missing values
test.isnull().sum()

id              0
comment_text    0
dtype: int64

# Text Preprocessing (TF-IDF)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Max_features = Build a vocabulary that only consider the top max_features ordered by term frequency
# Lowercase = Set all characters to lowercase before tokenizing
# Analyzer = Whether the feature should be made of word or character n-grams. Option ‘char_wb’ creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space.
# Stop_words = Stop words are words like “and”, “the”, “him”, which are presumed to be uninformative in representing the content of a text
# ngram_range = (1,1) means only unigrams, (1,2) means unigrams and bigrams, (1,3) means unigrams, bigrams, and trigrams
# Further ngrams knowledge = bigrams means it will learn the occurence of every two words, trigrams would be every 3, etc.
# dtype = type of the matrix returned, default is float64

tfidf_word = TfidfVectorizer(max_features=10000, lowercase=True, analyzer="word", stop_words="english", ngram_range=(1,3), dtype=np.float32)
tfidf_char = TfidfVectorizer(max_features=10000, lowercase=True, analyzer="char", stop_words="english", ngram_range=(3,6), dtype=np.float32)

In [7]:
# Fitting TFIDF vectors to the data
train_vector_word = tfidf_word.fit_transform(X['comment_text'])
test_vector_word = tfidf_word.fit_transform(y['comment_text'])

train_vector_char = tfidf_char.fit_transform(X['comment_text'])
test_vector_char = tfidf_char.fit_transform(y['comment_text'])

In [8]:
from scipy.sparse import hstack
# Hstack is needed to compute a cross_validation_score

#X_train
train_features = X_train = hstack([train_vector_char, train_vector_word])

#X_test
test_features = X_test =  hstack([test_vector_char, test_vector_word])

# Building and training the model

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

model_results = {
    "toxic": {},
    "severe_toxic": {},
    "obscene": {},
    "threat": {},
    "insult": {},
    "identity_hate": {}
}

for label in labels:
    for c in [0.001, 0.01, 0.1, 1, 3, 5, 10, 15, 25, 100]:
        target = X[label]
        # C = inverse of regularization strength, smaller value = stronger regularization
        # Solver = algorithm to use in the optimization problem
        # Possible solvers: newton-cg, lbfgs, liblinear, sag, saga, default is lbfgs
        # For multiclass problems, we should use newton-cg, sag, saga, and lbfgs
        logisitic_regression_classifier = LogisticRegression(C=c, solver="sag")
        cross_validation_score = np.mean(cross_val_score(logisitic_regression_classifier, train_features, target, cv=5, scoring='accuracy'))

        # Fitting the model with the training features
        # train_features = x_train
        # target = y_train
        logisitic_regression_classifier.fit(X_train, target)

        #predict just gives you the class for every example
        # predict_proba gives you the probability for every class, and predict is just taking the class which maximal probability
        # predict_log_proba gives you the logarithm of the probabilities, this is often handier as probabilities can become very, very small

        model_results[label][f"cv_score, c={c}"] = cross_validation_score
        model_results[label][f"test accuracy, c={c}"] = logisitic_regression_classifier.score(X_test, y[label])
        model_results[label][f"train accuracy, c={c}"] = logisitic_regression_classifier.score(X_train, X[label])

model_results = pd.DataFrame.from_dict(model_results)
model_results.head()



Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
"cv_score, c=0.001",0.903893,0.990116,0.946965,0.996938,0.950573,0.991441
"test accuracy, c=0.001",0.904786,0.989743,0.947254,0.99718,0.950805,0.990621
"train accuracy, c=0.001",0.903893,0.990116,0.946965,0.996938,0.950573,0.991441
"cv_score, c=0.01",0.918119,0.990116,0.955031,0.996938,0.954225,0.991441
"test accuracy, c=0.01",0.904786,0.989743,0.947254,0.99718,0.950805,0.990621


In [17]:
model_results.to_csv("./model_results.csv")

# Testing Emojis

In [10]:
test = pd.read_csv("../data/twitter.csv")

test = test[['text']]
pd.set_option('display.max_colwidth', None)
test.head()

Unnamed: 0,text
0,"🕸️🖤 ""...media in vita in morte sumos!💀🖤 Buona notte, amici cari!"" 🖤💀🕸️\r\n(...where's the liver and the fava beans?)\r\n🙄\r\n#halloweencountdown 🕰️ https://t.co/GgE1uut3k6"
1,Who doesn’t like a yellow school bus 🚌🤡???? #thestruggleisreal https://t.co/99nyXPmVL1
2,Gratgson speeding \r\n\r\n🏁 @MartinsvilleSwy \r\n🏆 #NASCARPlayoffs \r\n☠#DeadOnTools250
3,Hot take: smile is too goofy to be scary💀💀💀
4,Respectfully....the heartstopper community needs to learn the difference between empathy and sympathy 💀


In [11]:
import re
import emoji
import regex

test['processed_text'] = test['text'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
test['processed_text'] = test['processed_text'].str.replace('[^A-Za-z0-9]', ' ', flags=re.UNICODE)
test['processed_text'] = test['processed_text'].apply(lambda row: ''.join(re.sub(r'@\w+','',row)))

test['Emojis'] = test['text'].apply(lambda row: ''.join(c for c in row if c in emoji.distinct_emoji_list(c)))
test = test.replace(r'\n',' ', regex=True) 
test.drop_duplicates()
test.head()

  test['processed_text'] = test['processed_text'].str.replace('[^A-Za-z0-9]', ' ', flags=re.UNICODE)


Unnamed: 0,text,processed_text,Emojis
0,"🕸️🖤 ""...media in vita in morte sumos!💀🖤 Buona notte, amici cari!"" 🖤💀🕸️\r (...where's the liver and the fava beans?)\r 🙄\r #halloweencountdown 🕰️ https://t.co/GgE1uut3k6",media in vita in morte sumos Buona notte amici cari where s the liver and the fava beans halloweencountdown,🕸🖤💀🖤🖤💀🕸🙄🕰
1,Who doesn’t like a yellow school bus 🚌🤡???? #thestruggleisreal https://t.co/99nyXPmVL1,Who doesn t like a yellow school bus thestruggleisreal,🚌🤡
2,Gratgson speeding \r \r 🏁 @MartinsvilleSwy \r 🏆 #NASCARPlayoffs \r ☠#DeadOnTools250,Gratgson speeding MartinsvilleSwy NASCARPlayoffs DeadOnTools250,🏁🏆☠
3,Hot take: smile is too goofy to be scary💀💀💀,Hot take smile is too goofy to be scary,💀💀💀
4,Respectfully....the heartstopper community needs to learn the difference between empathy and sympathy 💀,Respectfully the heartstopper community needs to learn the difference between empathy and sympathy,💀


In [12]:
# Fitting TFIDF vectors to the data
emoji_test_vector_word = tfidf_word.transform(test['processed_text'])

emoji_test_vector_char = tfidf_char.transform(test['processed_text'])


In [13]:
# Features
emoji_test_features = hstack([emoji_test_vector_word, emoji_test_vector_char])
emoji_test_features

<28580x20000 sparse matrix of type '<class 'numpy.float32'>'
	with 4124276 stored elements in Compressed Sparse Row format>

In [14]:
results = pd.DataFrame.from_dict({"text": test.processed_text, "emojis": test.Emojis})
results.head()

Unnamed: 0,text,emojis
0,media in vita in morte sumos Buona notte amici cari where s the liver and the fava beans halloweencountdown,🕸🖤💀🖤🖤💀🕸🙄🕰
1,Who doesn t like a yellow school bus thestruggleisreal,🚌🤡
2,Gratgson speeding MartinsvilleSwy NASCARPlayoffs DeadOnTools250,🏁🏆☠
3,Hot take smile is too goofy to be scary,💀💀💀
4,Respectfully the heartstopper community needs to learn the difference between empathy and sympathy,💀


In [15]:
for label in labels:
    target = X[label]
    # C = inverse of regularization strength, smaller value = stronger regularization
    # Solver = algorithm to use in the optimization problem
    # Possible solvers: newton-cg, lbfgs, liblinear, sag, saga, default is lbfgs
    # For multiclass problems, we should use newton-cg, sag, saga, and lbfgs
    logisitic_regression_classifier = LogisticRegression(C=0.1, solver="sag")
    cross_validation_score = np.mean(cross_val_score(logisitic_regression_classifier, train_features, target, cv=5, scoring='accuracy'))

    # Fitting the model with the training features
    # train_features = x_train
    # target = y_train
    logisitic_regression_classifier.fit(X_train, target)

    #predict just gives you the class for every example
    # predict_proba gives you the probability for every class, and predict is just taking the class which maximal probability
    # predict_log_proba gives you the logarithm of the probabilities, this is often handier as probabilities can become very, very small


    results[label] = logisitic_regression_classifier.predict_proba(emoji_test_features)[:, 1]

KeyboardInterrupt: 

In [None]:
# results.head()
results = results.drop_duplicates()

In [None]:
results.to_csv("./emoji_logistic5.csv")