In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [196]:
from textblob import TextBlob
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from string import punctuation 

def form_sentence(tweet):
    tweet_blob = TextBlob(tweet)
    return ' '.join(tweet_blob.words)

def no_user_alpha(tweet):
    _stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER','url'])
    tweet_list = [ele for ele in tweet.split() if ele != 'AT_USER']
    clean_tokens = [t for t in tweet_list if re.match(r'[^\W\d]*$', t)]
    clean_s = ' '.join(clean_tokens)
    clean_mess = [word for word in clean_s.split() if word.lower() not in _stopwords]
    #clean_s = ' '.join(clean_mess)
    return clean_mess

def normalization(tweet_list):
    lem = WordNetLemmatizer()
    normalized_tweet = []
    for word in tweet_list:
        normalized_text = lem.lemmatize(word,'v')
        normalized_tweet.append(normalized_text)
    norm_tweet = ' '.join(normalized_tweet)
    return norm_tweet #normalized_tweet
    
def preprocessTweets(tweetlist):
    processedTweets=[]
    
    for tweet in tweetlist:
        tweet = tweet.lower() # convert text to lower-case
        tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
        tweet = re.sub('@[^\s]+', 'AT_USER', tweet) # remove usernames
        tweet = form_sentence(tweet)
        tweet = no_user_alpha(tweet)
        tweet = normalization(tweet)
        processedTweets.append(tweet)
    print(processedTweets[0])
    return processedTweets

In [200]:
df = pd.read_csv("./train.csv")
train_data = df['text'].to_numpy()
train_label = df['labels'].to_numpy()
#X_train, X_test, y_train, y_test = train_test_split(train_data, train_label, test_size=0.1, random_state=42)
#X_train = preprocessTweets(X_train)
#X_test = preprocessTweets(X_test)
X_train = preprocessTweets(train_data)
y_train = train_label

one worst time american cause serious damage country sure wish happy father day everyone know terrible father derangeddonald trumpisatraitor trumpisacriminal


In [207]:
Train_X_Tfidf = []
count = 0
num_view_classfier = 10
tfidf_vectorizer = [TfidfVectorizer(use_idf=True, ngram_range=(i+1,i+1)) for i in range(num_view_classfier)]
for i in range(num_view_classfier):
    fit_data=tfidf_vectorizer[i].fit(X_train)#_transform(Train_X)
    Train_X_Tfidf.append(fit_data.transform(X_train))
    count+= np.shape(Train_X_Tfidf[i])[1]

In [208]:
view_svm = [svm.SVC(C=22, probability=True) for i in range(num_view_classfier)] #kernel='linear', gamma = 0.001,
decision_scores = []
for i in range(num_view_classfier):
    view_svm[i].fit(Train_X_Tfidf[i],y_train)
    decision_scores.append(view_svm[i].decision_function(Train_X_Tfidf[i]))
    decision_scores.append(1-decision_scores[2*i])

In [209]:
train_meta = np.transpose(decision_scores)
meta_svm = svm.SVC(C=10) #kernel='linear'
meta_svm.fit(train_meta, y_train)

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [210]:
test_df = pd.read_csv('test.csv')
test_data = test_df['text']
X_test = preprocessTweets(test_data)
Test_X_Tfidf = []
for i in range(num_view_classfier):
    Test_X_Tfidf.append(tfidf_vectorizer[i].transform(X_test))

assange rapist


In [211]:
test_decision = []
for i in range(num_view_classfier):
    test_decision.append(view_svm[i].decision_function(Test_X_Tfidf[i]))
    test_decision.append(1-test_decision[2*i])

In [212]:
test_meta = np.transpose(test_decision)
#acc= meta_svm.score(test_meta, y_test)
#f1 = f1_score(y_test, meta_svm.predict(test_meta))
#print(acc, f1)
prediction = meta_svm.predict(test_meta)
print(prediction)
df_pred = pd.DataFrame(data={"labels": prediction})
df_pred.to_csv("./submission.csv", sep=',',index=False)

[1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1 0
 0 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 0 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 0 1 0 1 1 1 1 1 1 1 0 0 0 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 0 1 0 1 1 0 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1
 1 0 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 0 1 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 0 1
 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 1 1 0 1 0 1 1 1 0 1
 1 1 0 1 1 1 1 1 1 0 1 1 0 1 1 0 0 0 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1
 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 0 0 0 1 1 1 0 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 

In [61]:
from sklearn.linear_model import LogisticRegression

X_train = preprocessTweets(train_data)
y_train = train_label
tfidf_vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,20))
Train_X_Tfidf=tfidf_vectorizer.fit_transform(X_train)#_transform(Train_X)

lreg = LogisticRegression()
lreg.fit(Train_X_Tfidf, y_train) # training the model

test_df = pd.read_csv('test.csv')
test_data = test_df['text']
X_test = preprocessTweets(test_data)
Test_X_Tfidf = tfidf_vectorizer.transform(X_test)
prediction = lreg.predict_proba(Test_X_Tfidf) # predicting on the validation set
prediction_int = prediction[:,1] >= 0.5 # if prediction is greater than or equal to 0.5 than 1 else 0
prediction_int = prediction_int.astype(np.int)

df_pred = pd.DataFrame(data={"labels": prediction_int})
df_pred.to_csv("./submission.csv", sep=',',index=False)

#acc = accuracy_score(y_test, prediction_int)
#f1 = f1_score(y_test, prediction_int) # calculating f1 score
#print(acc, f1)

one worst time american cause serious damage country sure wish happy father day everyone know terrible father derangeddonald trumpisatraitor trumpisacriminal
assange rapist


In [24]:
from sklearn.ensemble import RandomForestClassifier

RFclf = RandomForestClassifier(n_jobs=2, random_state=42)
RFclf.fit(Train_X_Tfidf, y_train)

prediction = RFclf.predict(Test_X_Tfidf)
import csv

df_pred = pd.DataFrame(data={"labels": prediction_int})
df_pred.to_csv("./submissio.csv", sep=',',index=False)

#correct += np.sum(prediction == y_test)
acc = accuracy_score(y_test, prediction)
f1 = f1_score(y_test, prediction) # calculating f1 score
print(acc, f1)

0.65 0.7549844926894108


In [28]:
import nltk

def buildVocabulary(preprocessedTrainingData, label):
    all_words = []
    
    for (words, sentiment) in zip(preprocessedTrainingData, label):
        all_words.extend(words)

    wordlist = nltk.FreqDist(all_words)
    word_features = wordlist.keys()
    
    return word_features

def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in tweet_words)
    return features 

word_features = buildVocabulary(X_train, y_train)
trainingFeatures = nltk.classify.apply_features(extract_features, X_train)

NBayesClassifier = nltk.NaiveBayesClassifier.train(trainingFeatures)

NBResultLabels = [NBayesClassifier.classify(extract_features(tweet)) for tweet in X_test]

# get the majority vote
if NBResultLabels.count('positive') > NBResultLabels.count('negative'):
    print("Overall Positive Sentiment")
    print("Positive Sentiment Percentage = " + str(100*NBResultLabels.count('positive')/len(NBResultLabels)) + "%")
else: 
    print("Overall Negative Sentiment")
    print("Negative Sentiment Percentage = " + str(100*NBResultLabels.count('negative')/len(NBResultLabels)) + "%")

KeyboardInterrupt: 