In [67]:
import pandas as pd
import numpy as np

In [68]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [69]:
train_df['TweetText'] = train_df['TweetText'].apply(lambda x: x.replace('RT', '').replace("'RT", ''))
test_df['TweetText'] = test_df['TweetText'].apply(lambda x: x.replace('RT', '').replace("'RT", ''))

# Preprocessing

In [31]:
!pip install tweet-preprocessor

^C


In [9]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
     ------------------------------------ 289.9/289.9 kB 198.9 kB/s eta 0:00:00
Collecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp310-cp310-win_amd64.whl (39 kB)
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24


In [70]:
import preprocessor as p
train_df['Cleaned_Tweet'] = train_df['TweetText'].apply(lambda x: p.clean(x))
test_df['Cleaned_Tweet'] = test_df['TweetText'].apply(lambda x: p.clean(x))

In [71]:
import re
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
import contractions

def expand_contractions(text):
    return contractions.fix(text)

def remove_digits(text):
    text = re.sub(r"\d+", "", text)
    return text.lower()

def remove_punctuation(words):
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', (word))
        if new_word != '':
            new_words.append(new_word)
    return new_words

lemmatizer = WordNetLemmatizer()
w_tokenizer = TweetTokenizer()

def lemmatize_text(words):
    return [lemmatizer.lemmatize(w) for w in words]

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = [w for w in text if not w in stop_words]
    return words

def preprocess(text):
    text = remove_digits(text)
    text = expand_contractions(text)
    words = w_tokenizer.tokenize(text)
    words = remove_punctuation(words)
    words = lemmatize_text(words)
    words = remove_stopwords(words)
    return words

train_df['Cleaned_Tweet'] = train_df['Cleaned_Tweet'].apply(preprocess)
test_df['Cleaned_Tweet'] = test_df['Cleaned_Tweet'].apply(preprocess)

[nltk_data] Downloading package wordnet to C:\Users\younes
[nltk_data]     G\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\younes
[nltk_data]     G\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [72]:
train_df = train_df[train_df['Cleaned_Tweet'].str.strip().astype(bool)]
test_df = test_df[train_df['Cleaned_Tweet'].str.strip().astype(bool)]

  test_df = test_df[train_df['Cleaned_Tweet'].str.strip().astype(bool)]


In [73]:
train_df.head()

Unnamed: 0,TweetId,Label,TweetText,Cleaned_Tweet
0,304271250237304833,Politics,'#SecKerry: The value of the @StateDept and @U...,"[value, measured, dollar, term, deepest, ameri..."
1,304834304222064640,Politics,'@rraina1481 I fear so',[fear]
2,303568995880144898,Sports,'Watch video highlights of the #wwc13 final be...,"[watch, video, highlight, final, australia, we..."
3,304366580664528896,Sports,' @chelscanlan: At Nitro Circus at #AlbertPark...,"[nitro, circus]"
4,296770931098009601,Sports,'@cricketfox Always a good thing. Thanks for t...,"[always, good, thing, thanks, feedback]"


# Feature extraction

In [74]:
def word_counts_in_tweet(data):
    count = [0 for i in range(data.shape[0]) ]
    for i in range(data.shape[0]) :
        count[i]=len(data['Cleaned_Tweet'][i])
    data['Word_Count'] = count

In [75]:
word_counts_in_tweet(train_df)
word_counts_in_tweet(test_df)

In [76]:
def word_count(data):
    freqs = {}
    for i in range(data.shape[0]) :
        label = data['Label'][i]
        for word in  data['Cleaned_Tweet'][i]:
            pair = ((word,label))
            if pair in freqs :
                freqs[pair] += 1
            else :
                freqs[pair] = 1
    return freqs

In [77]:
freqs = word_count(train_df)

In [78]:
def vectorize_tweet(data,freqs) :
    politics_weights = [0 for i in range(data.shape[0])]
    sports_weights = [0 for i in range(data.shape[0])]
    for i in range(data.shape[0]) :
        for word in  data['Cleaned_Tweet'][i]:
            if (word,'Politics') in freqs :
                 politics_weights[i] += freqs.get((word,'Politics'))
            if (word,'Sports') in freqs :
                sports_weights[i] += freqs.get((word,'Sports'))
    data['politic_Weight'] = politics_weights
    data['sport_Weight'] = sports_weights 
    data.drop(columns=['Cleaned_Tweet'],inplace=True)

In [79]:
vectorize_tweet(train_df,freqs)
vectorize_tweet(test_df,freqs)

In [80]:
train_df.head()

Unnamed: 0,TweetId,Label,TweetText,Word_Count,politic_Weight,sport_Weight
0,304271250237304833,Politics,'#SecKerry: The value of the @StateDept and @U...,7,113,4
1,304834304222064640,Politics,'@rraina1481 I fear so',1,6,2
2,303568995880144898,Sports,'Watch video highlights of the #wwc13 final be...,7,269,430
3,304366580664528896,Sports,' @chelscanlan: At Nitro Circus at #AlbertPark...,2,0,4
4,296770931098009601,Sports,'@cricketfox Always a good thing. Thanks for t...,5,99,254


# Model Training

In [81]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import roc_auc_score

In [82]:
train_df['Label'] = train_df['Label'].map({'Sports':0,'Politics':1})

In [83]:
Y = train_df['Label']
X = train_df.drop(columns=['TweetId','Label','TweetText'])

In [84]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

In [85]:
algos = [
    RandomForestClassifier(random_state=52),
    GaussianNB(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    LogisticRegression(),
    SVC(probability=True)
]

df = {"Model": [], "Roc_Auc Score": [], "Score":[],"Cross validation score":[]}
for algo in algos:
    model = algo.fit(X_train, y_train)
    y_pred = algo.predict_proba(X_test)[:, 1]
    df["Model"].append(str(algo)[:str(algo).find('(')])
    df["Roc_Auc Score"].append(roc_auc_score(y_test, y_pred))
    df['Score'].append(algo.score(X_test, y_test))
    df['Cross validation score'].append(cross_val_score(algo, X, Y, cv=5).mean())

models = pd.DataFrame(df)

In [86]:
models

Unnamed: 0,Model,Roc_Auc Score,Score,Cross validation score
0,RandomForestClassifier,0.951851,0.881001,0.884444
1,GaussianNB,0.89958,0.806946,0.810268
2,DecisionTreeClassifier,0.855684,0.849847,0.848429
3,KNeighborsClassifier,0.935492,0.875894,0.878008
4,LogisticRegression,0.951552,0.891216,0.885057
5,SVC,0.950989,0.881512,0.879234


In [89]:
from sklearn.metrics import confusion_matrix
evaluation = {"Model":[],"Accuracy":[],"Precision":[]}
for algo in algos:
    algo.fit(X_train, y_train)
    y = algo.predict_proba(X_test)
    y_pred = []
    for l in y:
        if l[0] > l[1]:
            y_pred.append(0)
        else:
            y_pred.append(1)
    tn, fp, fn, tp =confusion_matrix(y_test,y_pred).ravel()
    accuracy = (tp+tn)/(tn+tp+fn+fp)
    precision = tp / (tp+fp)
    evaluation["Model"].append(str(algo)[:str(algo).find('(')])
    evaluation["Accuracy"].append(accuracy)
    evaluation["Precision"].append(precision)
algos_evaluation = pd.DataFrame(evaluation)

In [90]:
algos_evaluation

Unnamed: 0,Model,Accuracy,Precision
0,RandomForestClassifier,0.88049,0.903122
1,GaussianNB,0.806946,0.912688
2,DecisionTreeClassifier,0.850358,0.854938
3,KNeighborsClassifier,0.875894,0.889474
4,LogisticRegression,0.891216,0.926829
5,SVC,0.885087,0.908405


# Hyper-parametre Tuning

In [91]:
from sklearn.model_selection import GridSearchCV

In [92]:
param_grid = {
    'C':[0.011,0.012,0.013,0.014,0.015,0.016,0.017]
}

In [93]:
lr = LogisticRegression()

# Instantiate GridSearchCV object
grid_search = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5, verbose=2,n_jobs=-1)

# Fit GridSearchCV object to training data
grid_search.fit(X_train, y_train)

#Print best hyperparameters and corresponding accuracy score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best accuracy score: ", grid_search.best_score_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Best hyperparameters:  {'C': 0.017}
Best accuracy score:  0.8806608171296684


# Performance

In [94]:
classifier = LogisticRegression(C = 0.017)
classifier.fit(X,Y)
y_pred = classifier.predict(test_df.drop(columns=['TweetId','TweetText'],inplace=False))

In [95]:
y_pred

array([0, 0, 1, ..., 0, 0, 1], dtype=int64)

In [96]:
submit = pd.DataFrame()
submit['TweetId']=test_df['TweetId']
label = ['Sports' if  y_pred[i]==0 else 'Politics' for i in range(len(y_pred))]
submit['Label'] = label
submit

Unnamed: 0,TweetId,Label
0,306486520121012224,Sports
1,286353402605228032,Sports
2,289531046037438464,Politics
3,306451661403062273,Politics
4,297941800658812928,Sports
...,...,...
2605,282023761044189184,Sports
2606,303879735006601216,Sports
2607,297956846046703616,Sports
2608,304265049537658880,Sports


In [27]:
submit.to_csv('submission.csv',index=False)

My approach to improve the current solution is cleaning the data in a efficient way and minimizing the number of features using PCA also we can use RNN (lstm).

Released By GUENDOUL Younes

team name in the competition : PGX-DS-T16161