In [1]:
import pandas as pd
import numpy as np


In [2]:
train=pd.read_csv('train_2kmZucJ.csv')
test=pd.read_csv('test_oJQbWVk.csv')

In [3]:
print(train.info())
print(train.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 3 columns):
id       7920 non-null int64
label    7920 non-null int64
tweet    7920 non-null object
dtypes: int64(2), object(1)
memory usage: 185.7+ KB
None
   id  label                                              tweet
0   1      0  #fingerprint #Pregnancy Test https://goo.gl/h1...
1   2      0  Finally a transparant silicon case ^^ Thanks t...
2   3      0  We love this! Would you go? #talk #makememorie...
3   4      0  I'm wired I know I'm George I was made that wa...
4   5      1  What amazing service! Apple won't even talk to...


In [4]:
print(test.info())
print(test.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1953 entries, 0 to 1952
Data columns (total 2 columns):
id       1953 non-null int64
tweet    1953 non-null object
dtypes: int64(1), object(1)
memory usage: 30.6+ KB
None
     id                                              tweet
0  7921  I hate the new #iphone upgrade. Won't let me d...
1  7922  currently shitting my fucking pants. #apple #i...
2  7923  I'd like to puts some CD-ROMS on my iPad, is t...
3  7924  My ipod is officially dead. I lost all my pict...
4  7925  Been fighting iTunes all night! I only want th...


In [5]:
tweet_train=train.iloc[:,2]
tweet_test=test.iloc[:,1]

In [6]:
label=train.iloc[:,1]

## 2. Preprocessing the train & test data

In [7]:
# Removing urls,numbers,symbols
# Removing the urls
tweet_train=tweet_train.str.replace(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*',' ')
tweet_test=tweet_test.str.replace(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*',' ')


In [8]:
print(tweet_train[:5])
print(tweet_test[:5])

0    #fingerprint #Pregnancy Test   #android #apps ...
1    Finally a transparant silicon case ^^ Thanks t...
2    We love this! Would you go? #talk #makememorie...
3    I'm wired I know I'm George I was made that wa...
4    What amazing service! Apple won't even talk to...
Name: tweet, dtype: object
0    I hate the new #iphone upgrade. Won't let me d...
1    currently shitting my fucking pants. #apple #i...
2    I'd like to puts some CD-ROMS on my iPad, is t...
3    My ipod is officially dead. I lost all my pict...
4    Been fighting iTunes all night! I only want th...
Name: tweet, dtype: object


In [9]:
# Removing the Symbols
tweet_train=tweet_train.str.replace(r'[^\w]',' ')
tweet_test=tweet_test.str.replace(r'[^\w]',' ')

In [10]:
print(tweet_train[:5])
print(tweet_test[:5])

0     fingerprint  Pregnancy Test    android  apps ...
1    Finally a transparant silicon case    Thanks t...
2    We love this  Would you go   talk  makememorie...
3    I m wired I know I m George I was made that wa...
4    What amazing service  Apple won t even talk to...
Name: tweet, dtype: object
0    I hate the new  iphone upgrade  Won t let me d...
1    currently shitting my fucking pants   apple  i...
2    I d like to puts some CD ROMS on my iPad  is t...
3    My ipod is officially dead  I lost all my pict...
4    Been fighting iTunes all night  I only want th...
Name: tweet, dtype: object


In [11]:
# Removing the Numbers
tweet_train=tweet_train.str.replace(r'[0-9]+',' ')
tweet_test=tweet_test.str.replace(r'[0-9]+',' ')

In [12]:
print(tweet_train[:5])
print(tweet_test[:5])

0     fingerprint  Pregnancy Test    android  apps ...
1    Finally a transparant silicon case    Thanks t...
2    We love this  Would you go   talk  makememorie...
3    I m wired I know I m George I was made that wa...
4    What amazing service  Apple won t even talk to...
Name: tweet, dtype: object
0    I hate the new  iphone upgrade  Won t let me d...
1    currently shitting my fucking pants   apple  i...
2    I d like to puts some CD ROMS on my iPad  is t...
3    My ipod is officially dead  I lost all my pict...
4    Been fighting iTunes all night  I only want th...
Name: tweet, dtype: object


In [13]:
# Turning them into lower case
tweet_train=tweet_train.str.lower()
tweet_test=tweet_test.str.lower()

In [14]:
print(tweet_train[:5])
print(tweet_test[:5])

0     fingerprint  pregnancy test    android  apps ...
1    finally a transparant silicon case    thanks t...
2    we love this  would you go   talk  makememorie...
3    i m wired i know i m george i was made that wa...
4    what amazing service  apple won t even talk to...
Name: tweet, dtype: object
0    i hate the new  iphone upgrade  won t let me d...
1    currently shitting my fucking pants   apple  i...
2    i d like to puts some cd roms on my ipad  is t...
3    my ipod is officially dead  i lost all my pict...
4    been fighting itunes all night  i only want th...
Name: tweet, dtype: object


In [15]:
# Removing Whitespace
# Replace whitespace between terms with a single space
tweet_train = tweet_train.str.replace(r'\s+', ' ')
tweet_test = tweet_test.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
tweet_train = tweet_train.str.replace(r'^\s+|\s+?$', '')
tweet_test = tweet_test.str.replace(r'^\s+|\s+?$', '')

In [16]:
print(tweet_train[:5])
print(tweet_test[:5])

0    fingerprint pregnancy test android apps beauti...
1    finally a transparant silicon case thanks to m...
2    we love this would you go talk makememories un...
3    i m wired i know i m george i was made that wa...
4    what amazing service apple won t even talk to ...
Name: tweet, dtype: object
0    i hate the new iphone upgrade won t let me dow...
1    currently shitting my fucking pants apple imac...
2    i d like to puts some cd roms on my ipad is th...
3    my ipod is officially dead i lost all my pictu...
4    been fighting itunes all night i only want the...
Name: tweet, dtype: object


In [17]:
# Removing the Stop Words
from nltk.corpus import stopwords

stop_words=set(stopwords.words('english'))

tweet_train = tweet_train.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

In [18]:
from nltk.corpus import stopwords

stop_words=set(stopwords.words('english'))

tweet_test = tweet_test.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

In [19]:
# Stemming the Words using Porter Stemmer
import nltk
from nltk.stem.porter import PorterStemmer
ps=nltk.PorterStemmer()

tweet_train=tweet_train.apply(lambda x:' '.join(ps.stem(term) for term in x.split()))
tweet_test=tweet_test.apply(lambda x:' '.join(ps.stem(term) for term in x.split()))


In [20]:
print(tweet_train.shape)
print(tweet_test.shape)

(7920,)
(1953,)


In [21]:
from nltk.tokenize import word_tokenize

# Creating a Bag of Words Model
all_words=[]

for text in tweet_train:
    words=word_tokenize(text)
    for w in words:
        all_words.append(w)
        
for text in tweet_test:
    words=word_tokenize(text)
    for w in words:
        all_words.append(w)

all_words=nltk.FreqDist(all_words)

In [22]:
print(len(all_words))

17859


In [23]:
word_features=list(all_words.keys())

In [24]:
def find_features(text):
    words=word_tokenize(text)
    features={}
    for word in word_features:
        features[word]=(word in words)
        
    return features

features=find_features(tweet_train[0])

for key,values in features.items():
    if values==True:
        print(key)

fingerprint
pregnanc
test
android
app
beauti
cute
health
iger
iphoneonli
iphonesia
iphon


In [25]:
featuresset_train=[(find_features(tweet)) for (tweet) in tweet_train]
featuresset_test=[(find_features(tweet)) for (tweet) in tweet_test]

In [26]:
featureset=list(zip(featuresset_train,label))

In [29]:
from sklearn.model_selection import train_test_split
training,validation=train_test_split(featureset,test_size=0.1,random_state=365)

## 3. Building the Model and Fitting the Data

In [30]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.train(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, validation)*100
print("SVC Accuracy: {}".format(accuracy))

MemoryError: 

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

names=['KNeighbors Classifier','DecisionTree Classifier','RandomForest Classifier','Logistic Regression','SGD Classifier','Multinomial NB','SVC']
classifier=[KNeighborsClassifier(),DecisionTreeClassifier(),RandomForestClassifier(),LogisticRegression(),SGDClassifier(max_iter=100),MultinomialNB(),SVC(kernel='linear')]

models = zip(names, classifier)

for name,classifier in models:
    nltk_model=SklearnClassifier(classifier)
    nltk_model.train(training)
    accuracy=nltk.classify.accuracy(nltk_model,validation)*100
    print('{} Accuracy : {}'.format(name,accuracy))

In [None]:
# Ensemble methods - Voting classifier
from sklearn.ensemble import VotingClassifier

names = [ "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifier = [
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')]

models = list(zip(names, classifier))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, validation)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

In [None]:
prediction=nltk_ensemble.classify_many(featuresset_test)

In [None]:
prediction

In [None]:
test_ID=test.iloc[:,0]
print(test_ID)

In [None]:
submission_1=pd.DataFrame(test_ID,columns=['id'])

In [None]:
print(submission_1.head())

In [None]:
submission_1['label']=prediction

In [None]:
submission_1

In [None]:
submission_1.to_csv('Submission.csv')