In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

In [2]:
df = pd.read_csv('D:/Python/Machine/nlp-getting-started/train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
df['keyword'].unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [4]:
df['location'].unique()

array([nan, 'Birmingham', 'Est. September 2012 - Bristol', ...,
       'Vancouver, Canada', 'London ', 'Lincoln'], dtype=object)

In [5]:
df = df.drop(['id', 'keyword', 'location'], axis = 1)

df

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
5,#RockyFire Update => California Hwy. 20 closed...,1
6,#flood #disaster Heavy rain causes flash flood...,1
7,I'm on top of the hill and I can see a fire in...,1
8,There's an emergency evacuation happening now ...,1
9,I'm afraid that the tornado is coming to our a...,1


In [6]:
X = df['text']
y = df['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.25)

X_train.head()

844     SEAN END CAREER sG Blizzard vs KNOCKOUT ... ht...
4508    Be careful during hurricane season ???? https:...
5665    UD: Rescue (Structural Collapse) - Scott Road ...
6596    @RobPulseNews @huyovoeTripolye Phillips should...
2006                           #NP Metallica - Damage Inc
Name: text, dtype: object

In [7]:
tokenizer = RegexpTokenizer(r"\w+") # 仅选择字母数字字符
en_stop = set(stopwords.words('english')) # 取出所有英文非标识符的内容
ps = PorterStemmer() # 从任何给定的单词中提取词干

In [8]:
def getStemmedTweet(tweet):
    
    tweet = tweet.lower()     # 将tweet中的大写形式转为小写形式
    
    tokens = tokenizer.tokenize(tweet)
    new_tokens = [token for token in tokens if token not in en_stop]
    stemmed_tokens = [ps.stem(token) for token in new_tokens]
    
    cleaned_review = ' '.join(stemmed_tokens)
    
    return cleaned_review
rand_num = 34
print("Review ===> ", X_train[rand_num])
print("Preprocessed Review ===>", getStemmedTweet(X_train[rand_num]))

KeyError: 34

In [9]:
X_cleaned = X_train.apply(getStemmedTweet)
Xval_cleaned = X_val.apply(getStemmedTweet)

In [10]:
cv = CountVectorizer()

X_vec = cv.fit_transform(X_cleaned).toarray()
Xval_vec = cv.transform(Xval_cleaned).toarray()

print(X_vec.shape)
print(Xval_vec.shape)

(5709, 15011)
(1904, 15011)


In [11]:
mnb = MultinomialNB()
mnb.fit(X_vec, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
mnb.score(Xval_vec, y_val)

0.7977941176470589

In [13]:
test = pd.read_csv('D:/Python/Machine/nlp-getting-started/test.csv')

# get the main 'text' column
test_ids = test['id']
test = test['text']

# apply the preprocessing pipeline
test = test.apply(getStemmedTweet)

# creating the count vectors from the dataset
test_vec = cv.transform(test).toarray()

predictions = mnb.predict(test_vec)

In [14]:
predictions = pd.Series(predictions)
ids = pd.Series(test_ids)

pred_df = pd.concat([ids, predictions], keys = ['id', 'target'], axis = 1)

pred_df.to_csv('submission.csv',index = False)