In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from string import punctuation
import nltk
from nltk import tokenize
from nltk import ngrams
nltk.download('rslp')

[nltk_data] Downloading package rslp to /home/alberto/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

In [51]:
train_df = pd.read_csv('train.csv')

In [52]:
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target
5722,8165,rescuers,,Last Second Ebay Bid RT? http://t.co/oEKUcq4ZL...,0
523,755,avalanche,Ireland,A little piece I wrote for the Avalanche Desig...,0
2370,3408,derail,,@EmiiliexIrwin Totally agree.She is 23 and kno...,0
5361,7650,panic,,The cool kids asked me if I wanted to hang out...,0
6802,9745,tragedy,,Robert Gagnon reviews the catastrophe of impos...,1


In [53]:
pontuacao = list()
for ponto in punctuation:
    pontuacao.append(ponto)
pontuacao[:5]

['!', '"', '#', '$', '%']

In [54]:
def cleaning_tweets(df, tweet_column):
    token_pont = tokenize.WordPunctTokenizer()

    process_word = []
    for tweet in df[tweet_column]:
        new_quote = list()
        word_quote = token_pont.tokenize(tweet)
        for word in word_quote:
            if word.lower() not in pontuacao:
                new_quote.append(word)
        process_word.append(' '.join(new_quote))
    df['treated'] = process_word
    
    stemmer = nltk.RSLPStemmer()
    process_word = []
    for tweet in df['treated']:
        new_quote = list()
        word_quote = token_pont.tokenize(tweet)
        for word in word_quote:
            new_quote.append(stemmer.stem(word))
        process_word.append(' '.join(new_quote))
    df['stemmed'] = process_word

In [55]:
def tweet_pred(dataframe, column_name, max_f):
    vect = CountVectorizer(lowercase=False, max_features=max_f)
    bag_of_words = vect.fit_transform(dataframe[column_name])
    X_train, X_test, y_train, y_test = train_test_split(bag_of_words, dataframe.target, test_size=0.33, random_state=42)

    reg = LogisticRegression()
    reg = reg.fit(X_train, y_train)
    print(f'reg accuracy: {reg.score(X_test, y_test)}')
    
    tfidf = TfidfVectorizer(ngram_range=(1,1))
    tfidf_fit = tfidf.fit_transform(train_df['stemmed'])

    X_train, X_test, y_train, y_test = train_test_split(tfidf_fit, train_df['target'], test_size=0.33, random_state=42)

    n_reg = LogisticRegression()
    n_reg = n_reg.fit(X_train, y_train)
    print(f'n_reg accuracy: {n_reg.score(X_test, y_test)}')

In [56]:
cleaning_tweets(train_df, 'text')
train_df.head()

Unnamed: 0,id,keyword,location,text,target,treated,stemmed
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,our deed are the reason of thil earthquak may ...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,forest fir ne la rong sask can
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,all resident asked to shelt in plac are being ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13 000 people receive wildfires evacuation ord...,13 000 peopl receiv wildf evacuation ord in ca...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,just got sent thil phot from ruby alask as smo...


In [57]:
test_df = pd.read_csv('test.csv')

In [58]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [59]:
cleaning_tweets(test_df, 'text')
test_df.head()

Unnamed: 0,id,keyword,location,text,treated,stemmed
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash,just happened a terribl car crash
1,2,,,"Heard about #earthquake is different cities, s...",Heard about earthquake is different cities sta...,heard about earthquak is different citi stay s...
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...,ther is a forest fir at spot pond gees are fle...
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting Spokane wildfires,apocalyps lighting spokan wildf
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudel kill 28 in chin and taiwan


In [81]:
tfidf = TfidfVectorizer(lowercase=True, max_df=0.7)
tfidf_fit = tfidf.fit_transform(train_df['stemmed'])

X_train, X_test, y_train, y_test = train_test_split(tfidf_fit, train_df['target'], test_size=0.33, random_state=42)

n_reg = LogisticRegression()
n_reg = n_reg.fit(X_train, y_train)
print(f'n_reg accuracy: {n_reg.score(X_test, y_test)}')

n_reg accuracy: 0.8081973736569836


In [84]:
tfidf = TfidfVectorizer(lowercase=True, ngram_range=(1,1))
tfidf_fit = tfidf.fit_transform(train_df['stemmed'])

X_train, X_test, y_train, y_test = train_test_split(tfidf_fit, train_df['target'], test_size=0.33, random_state=42)

n_reg = LogisticRegression()
n_reg = n_reg.fit(X_train, y_train)
print(f'n_reg accuracy: {n_reg.score(X_test, y_test)}')

n_reg accuracy: 0.8081973736569836


In [85]:
tfidf_fit_new = tfidf.transform(test_df['stemmed'])

In [87]:
predict = n_reg.predict(tfidf_fit_new)

In [92]:
test_df = test_df.assign(target=predict)
test_df.head()

Unnamed: 0,id,keyword,location,text,treated,stemmed,target
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash,just happened a terribl car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",Heard about earthquake is different cities sta...,heard about earthquak is different citi stay s...,0
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...,ther is a forest fir at spot pond gees are fle...,1
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting Spokane wildfires,apocalyps lighting spokan wildf,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudel kill 28 in chin and taiwan,1


In [94]:
submission = test_df[['id', 'target']]
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1


In [96]:
submission.shape

(3263, 2)

In [97]:
submission.to_csv('submission.csv', index=False)