In [2]:
import numpy as np
import pandas as pd
import sklearn
import tensorflow
import keras
import nltk

In [3]:
train_data= pd.read_csv('drive/My Drive/Kaggle Datasets/train (1).csv')
test_data= pd.read_csv('drive/My Drive/Kaggle Datasets/test (1).csv')

In [4]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
train_data.drop(columns=['keyword', 'location', 'id'], inplace=True)
test_data.drop(columns=['keyword', 'location'], inplace=True)

In [6]:
def clean_text(name):
    processed = name.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                     'emailaddress')

    # Replace URLs with 'webaddress'
    processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                      'webaddress')

    # Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
    processed = processed.str.replace(r'£|\$', 'moneysymb')

    # Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
    processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                      'phonenumbr')

    # Replace numbers with 'numbr'
    processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

    # Remove punctuation
    processed = processed.str.replace(r'[^\w\d\s]', ' ')

    # Replace whitespace between terms with a single space
    processed = processed.str.replace(r'\s+', ' ')

    # Remove leading and trailing whitespace
    processed = processed.str.replace(r'^\s+|\s+?$', '')

    # change words to lower case - Hello, HELLO, hello are all the same word
    processed = processed.str.lower()
    
    return processed

In [7]:
clean_train = clean_text(train_data["text"])
clean_test = clean_text(test_data["text"])

In [10]:
import nltk
nltk.download('stopwords')
stop_words= set(stopwords.words("english"))
clean_train = clean_train.apply(lambda x:" ".join(term for term in x.split() if term not in stop_words))

clean_test = clean_test.apply(lambda x:" ".join(term for term in x.split() if term not in stop_words))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
clean_train= clean_train.apply(lambda x: " ".join(ps.stem(word) for word in x.split() ))

In [12]:
clean_test = clean_test.apply(lambda x:" ".join([ps.stem(word) for word in x.split()]))

In [13]:
clean_train

0               deed reason earthquak may allah forgiv us
1                    forest fire near la rong sask canada
2       resid ask shelter place notifi offic evacu she...
3       numbr numbr peopl receiv wildfir evacu order c...
4       got sent photo rubi alaska smoke wildfir pour ...
                              ...                        
7608    two giant crane hold bridg collaps nearbi home...
7609    aria_ahrari thetawniest control wild fire cali...
7610    mnumbr numbr numbr utc numbrkm volcano hawaii ...
7611    polic investig e bike collid car littl portug ...
7612    latest home raze northern california wildfir a...
Name: text, Length: 7613, dtype: object

In [15]:
import nltk
nltk.download('wordnet')

wl = WordNetLemmatizer()

clean_train = clean_train.apply(lambda x:" ".join([wl.lemmatize(word) for word in x.split()]))

clean_test = clean_test.apply(lambda x:" ".join([wl.lemmatize(word) for word in x.split()]))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [16]:
clean_test

0                                happen terribl car crash
1           heard earthquak differ citi stay safe everyon
2       forest fire spot pond gee flee across street c...
3                          apocalyps light spokan wildfir
4                typhoon soudelor kill numbr china taiwan
                              ...                        
3258      earthquak safeti lo angel ûò safeti fasten xrwn
3259    storm ri wors last hurrican citi amp numbroth ...
3260         green line derail chicago http co utbxlcbiuy
3261    meg issu hazard weather outlook hwo http co nu...
3262      cityofcalgari activ municip emerg plan yycstorm
Name: text, Length: 3263, dtype: object

In [18]:
train_data["text"] = clean_train
test_data["text"] = clean_test

In [20]:
from sklearn.model_selection import train_test_split

seed = 42

X = train_data.text
y = train_data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

In [21]:
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "Bernouli", "PassiveAggressiveClassifier",
     "Naive Bayes", "SVC"]

classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(n_estimators=100),
    LogisticRegression(),
    MultinomialNB(),
    BernoulliNB(),
    PassiveAggressiveClassifier(max_iter=50),
    SVC(kernel="linear")
]

zipped_clf = zip(names, classifiers)
tvec = TfidfVectorizer()


In [22]:
def acc(pipeline, X_train, y_train, X_test, y_test):
    sentiment_fit = pipeline.fit(X_train, y_train)
    y_pred = sentiment_fit.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
   
    print("-"*30)
    
    print("accuracy score: {0:.2f}%".format(accuracy*100))
    
    print("-"*30)
    
    return accuracy

In [26]:
from sklearn.pipeline import Pipeline
def compare_clf(classifier=zipped_clf, vectorizer=tvec, n_features=10000, ngram_range=(1, 1)):
    result = []
    vectorizer.set_params(stop_words=stop_words, max_features=n_features, ngram_range=ngram_range)
    for n, c in classifier:
        checker_pipeline = Pipeline([
            ("vectorizer", vectorizer),
            ("classifier", c)
        ])
        clf_acc = acc(checker_pipeline, X_train, y_train, X_test, y_test)
        print("Model result for {}".format(n))
        print(c)
        result.append((n, clf_acc))
    return result

In [27]:
trigram_result = compare_clf()

------------------------------
accuracy score: 72.94%
------------------------------
Model result for Decision Tree
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')
------------------------------
accuracy score: 78.68%
------------------------------
Model result for Random Forest
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_s

In [30]:
def prediction(pipeline, testtext):
    sentiment_fit = pipeline.fit(X_train,y_train)
    y_pred = sentiment_fit.predict(testtext)
    
    return y_pred

vectorizer=TfidfVectorizer()
checker_pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', MultinomialNB())
        ])
vectorizer.set_params(stop_words=None, max_features=100000, ngram_range=(1,4))
prediction=prediction(checker_pipeline,test_data['text'])

In [31]:
prediction

array([1, 1, 1, ..., 1, 1, 1])

In [33]:
index = test_data.id
newFrame = pd.DataFrame({"id":index, "target":prediction})
newFrame.to_csv("realnot.csv", index=False)