# Sentiment analysis: Twitter data
## Linear SVM model

### First step: importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### obtaining the train data

In [2]:
# Load the labelled training tweets. The path is relative to the notebook's
# working directory (the same convention used when loading test_tweets.csv),
# so the notebook is not tied to one user's absolute Windows path.
train_data = pd.read_csv("train_tweets.csv")

### processing and cleaning data

In [3]:
def drop_features(features, data):
    """Remove the given columns from ``data`` in place.

    Parameters
    ----------
    features : label or list of labels
        Column name(s) to remove.
    data : pandas.DataFrame
        Frame to modify; it is mutated directly, no copy is made.

    Returns
    -------
    None
    """
    data.drop(columns=features, inplace=True)
    
import re       
def process_tweet(tweet):
    """Normalise a raw tweet into lower-case, space-separated alphanumeric tokens.

    The text is lower-cased, ``@mentions`` are removed, every character that
    is not an ASCII letter, digit, space or tab is replaced by a space, and
    runs of whitespace are collapsed to a single space.

    The mention pattern accepts ``_`` because Twitter handles may contain
    underscores; without it ``@some_user`` would leave ``user`` behind as a
    spurious token.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet; an empty string if nothing survives cleaning.
    """
    # The mention alternative is listed first so a whole handle is consumed
    # before the second alternative strips the remaining punctuation.
    cleaned = re.sub(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])", " ", tweet.lower())
    return " ".join(cleaned.split())

In [4]:
# Clean every raw tweet, then discard the columns the model does not use.
train_data['processed_tweets'] = train_data['tweet'].map(process_tweet)
drop_features(['id', 'tweet'], train_data)

In [5]:
# Preview the first ten rows to check the cleaned text and the remaining columns.
train_data.head(10)

Unnamed: 0,label,processed_tweets
0,0,when a father is dysfunctional and is so selfi...
1,0,thanks for lyft credit i can t use cause they ...
2,0,bihday your majesty
3,0,model i love u take with u all the time in ur
4,0,factsguide society now motivation
5,0,2 2 huge fan fare and big talking before they ...
6,0,camping tomorrow danny
7,0,the next school year is the year for exams can...
8,0,we won love the land allin cavs champions clev...
9,0,welcome here i m it s so gr8


### splitting the data

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    train_data["processed_tweets"],
    train_data["label"],
    test_size=0.2,
    random_state=42,
)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [8]:
# Token-count vectoriser that ignores English stop words.
count_vect = CountVectorizer(stop_words='english')
# TF-IDF re-weighting of those counts, with L2 normalisation and sublinear
# term-frequency scaling enabled.
transformer = TfidfTransformer(norm='l2',sublinear_tf=True)

In [9]:
# Fit the vocabulary and the TF-IDF weights on the training split only, then
# reuse the fitted objects on the held-out split so that no information from
# the evaluation tweets reaches the features.
x_train_counts = count_vect.fit_transform(x_train)
x_test_counts = count_vect.transform(x_test)

x_train_tfidf = transformer.fit_transform(x_train_counts)
x_test_tfidf = transformer.transform(x_test_counts)


### training the model

In [10]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel, trained on the TF-IDF
# features of the training split.
classifier = SVC(kernel = 'linear', random_state = 0)
# Left as the last expression so the notebook displays the fitted estimator.
classifier.fit(x_train_tfidf,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)

#### predictions

In [11]:
# Predict labels for the held-out split.
predictions = classifier.predict(x_test_tfidf)

In [12]:
from sklearn.metrics import accuracy_score
# Accuracy of the held-out predictions against the true labels.
print(accuracy_score(y_test,predictions))

0.9605818864382919


In [13]:
from sklearn.metrics import confusion_matrix,f1_score
# Per-class breakdown of the held-out predictions (scikit-learn convention:
# rows are true labels, columns are predicted labels).
confusion_matrix(y_test,predictions)

array([[5903,   34],
       [ 218,  238]], dtype=int64)

In [14]:
# F1 score with scikit-learn's default settings. Reported alongside accuracy
# because the confusion matrix shows that label-1 tweets are a small minority,
# so accuracy alone overstates how well that class is detected.
f1_score(y_test,predictions)

0.6538461538461539

### obtaining test data

In [15]:
# Load the tweets to be scored; the path is relative to the working directory.
test_data = pd.read_csv('test_tweets.csv')

### processing data

In [16]:
# Apply the same cleaning used for the training tweets, then drop the raw text.
# The 'id' column is kept because the output file needs it.
test_data['processed_tweet'] = test_data['tweet'].map(process_tweet)
drop_features(['tweet'], test_data)

In [17]:
# Refit the count vectoriser on every labelled tweet (not just the earlier
# training split), then encode the tweets to be scored with that vocabulary.
train_counts = count_vect.fit_transform(train_data['processed_tweets'])
test_counts = count_vect.transform(test_data['processed_tweet'])

In [18]:
# Refit the TF-IDF weighting on the full labelled counts and apply it to the
# counts of the tweets to be scored.
train_final = transformer.fit_transform(train_counts)
test_final = transformer.transform(test_counts)

### fitting the model

In [19]:
# Train the same classifier object again, this time on all labelled tweets.
classifier.fit(train_final,train_data['label'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)

#### predictions

In [20]:
# Predict labels for the tweets loaded from test_tweets.csv.
predictions1 = classifier.predict(test_final)

### obtaining the output file

In [21]:
# Pair each test tweet id with its predicted label and write the result
# without the DataFrame index column.
output_file = test_data[['id']].assign(label=predictions1)
output_file.to_csv('output.csv', index=False)