In [114]:
import pandas as pd
# Load the sentiment-labelled sentences from disk and preview the first rows.
DATA_FILE = 'Sentiment_Stock_data.csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,Sentiment,Sentence
0,0,"According to Gran , the company has no plans t..."
1,1,"For the last quarter of 2010 , Componenta 's n..."
2,1,"In the third quarter of 2010 , net sales incre..."
3,1,Operating profit rose to EUR 13.1 mn from EUR ...
4,1,"Operating profit totalled EUR 21.1 mn , up fro..."


In [115]:
# Number of rows in the freshly loaded frame.
df.shape[0]

108751

## Check for missing values:


In [116]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

Sentiment    0
Sentence     1
dtype: int64

In [117]:
# Discard every row that contains a missing value, then confirm the new row count.
df.dropna(axis=0, how='any', inplace=True)

df.shape[0]

108750

### Detect & remove empty strings


In [118]:
# Find the rows whose sentence is empty or whitespace-only, and remove them.
# `not sentence.strip()` also matches the zero-length string '', for which
# `str.isspace()` returns False, so an isspace-only check can never flag it.
# The column is addressed by name so the check does not depend on column order.
blanks = [
    idx
    for idx, sentence in df['Sentence'].items()
    if isinstance(sentence, str) and not sentence.strip()  # isinstance skips NaN cells
]

if blanks:
    # Drop by index label so the frame no longer contains the blank rows.
    df.drop(index=blanks, inplace=True)

print(len(blanks), 'blanks: ', blanks)

0 blanks:  []


In [119]:
# Row count after the blank-string check.
df.shape[0]

108750

### Take a quick look at the `Sentiment` column:

In [120]:
# Class balance: how many rows carry each Sentiment value.
sentiment_counts = df['Sentiment'].value_counts()
sentiment_counts

1    55724
0    53026
Name: Sentiment, dtype: int64

## Split the data into train & test sets:

In [121]:
# Features are the raw sentences; the target is the Sentiment label.
X, y = df['Sentence'], df['Sentiment']

In [122]:
# Normalise the text: replace every character that is not an ASCII letter
# (punctuation, digits, ...) with a space, then lower-case the result.
# Both steps are vectorised and run once over the whole Series; the result is
# bound to a new object instead of mutating the column still attached to `df`.
X = X.replace("[^a-zA-Z]", " ", regex=True).str.lower()
X.head(1)

0    according to gran   the company has no plans t...
Name: Sentence, dtype: object

In [123]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for testing; the fixed seed keeps the split reproducible.
split = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)
X_train, X_test, y_train, y_test = split

## Build pipelines 
In this step we will create two pipelines, each of which vectorizes the text and then fits a classifier to it.

In [124]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Naïve Bayes: TF-IDF features fed into a multinomial Naïve Bayes classifier.
text_clf_nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Linear SVC: the same TF-IDF features fed into a linear support-vector classifier.
text_clf_lsvc = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

### Train Naïve Bayes model

In [125]:
# Fit the TF-IDF + MultinomialNB pipeline on the training split.
text_clf_nb.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [126]:
# Form a prediction set: predict a Sentiment label for every held-out sentence
# with the Naïve Bayes pipeline.
predictions = text_clf_nb.predict(X_test)
print(predictions)

[0 0 1 ... 0 0 1]


In [127]:
# Confusion matrix for the Naïve Bayes predictions
# (rows: true label, columns: predicted label).
from sklearn import metrics
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[3157 4789]
 [2792 5575]]


In [128]:
# Per-class precision / recall / F1 for the Naïve Bayes predictions.
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.53      0.40      0.45      7946
           1       0.54      0.67      0.60      8367

    accuracy                           0.54     16313
   macro avg       0.53      0.53      0.52     16313
weighted avg       0.53      0.54      0.53     16313



In [129]:

# Overall accuracy of the Naïve Bayes predictions on the test split.
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.5352786121498192


## Linear SVC model

In [130]:
# Fit the TF-IDF + LinearSVC pipeline on the same training split.
text_clf_lsvc.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [131]:
# Form a prediction set with the Linear SVC pipeline.
# This rebinds `predictions`, replacing the Naïve Bayes results computed earlier.
predictions = text_clf_lsvc.predict(X_test)

In [132]:
# Confusion matrix for the Linear SVC predictions
# (rows: true label, columns: predicted label).
from sklearn import metrics
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[3990 3956]
 [3673 4694]]


In [133]:
# Per-class precision / recall / F1 for the Linear SVC predictions.
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.52      0.50      0.51      7946
           1       0.54      0.56      0.55      8367

    accuracy                           0.53     16313
   macro avg       0.53      0.53      0.53     16313
weighted avg       0.53      0.53      0.53     16313



In [134]:
# Overall accuracy of the Linear SVC predictions on the test split.
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.5323361736038742


### Scikit-learn's built-in list of stop words:

In [151]:
from sklearn.feature_extraction import text
# Show scikit-learn's built-in English stop-word collection (a frozenset).
print(text.ENGLISH_STOP_WORDS)

frozenset({'among', 'show', 'fire', 'from', 'almost', 'inc', 'per', 'towards', 'latter', 'he', 'have', 'whereas', 'eleven', 'front', 'forty', 'meanwhile', 'i', 'an', 'any', 'thence', 'has', 'about', 'seem', 'every', 'now', 'ltd', 'done', 'became', 'own', 'thru', 'part', 'always', 'thin', 'the', 'side', 'not', 'which', 'her', 'give', 'go', 'but', 'thereby', 'afterwards', 'very', 'wherever', 'she', 'mine', 'myself', 'or', 'seeming', 'six', 'across', 'himself', 'somewhere', 'enough', 'also', 'whence', 'sometimes', 'hasnt', 'on', 'nowhere', 'full', 'before', 'sincere', 'its', 'must', 'nobody', 'wherein', 'nine', 'being', 'move', 'herself', 'namely', 'above', 'if', 'becoming', 'beforehand', 'nothing', 'yourselves', 'only', 'within', 'over', 'both', 'perhaps', 'him', 'everywhere', 'sometime', 'take', 'everyone', 'his', 'is', 'elsewhere', 'below', 'everything', 'whose', 'cant', 'well', 'hundred', 'someone', 'whereupon', 'that', 'amoungst', 'two', 'sixty', 'those', 'will', 'with', 'this', 'the

In [152]:
# Custom stop-word list passed to TfidfVectorizer in place of scikit-learn's built-in set.
# Line continuations are implicit inside the brackets, so no trailing backslashes are needed.
stopwords = [
    'a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but',
    'by', 'can', 'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have',
    'he', 'her', 'hers', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
    'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or', 'see', 'seen',
    'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this', 'to',
    'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you',
]

In [153]:
# Build and fit a second Linear SVC pipeline whose TF-IDF step ignores the
# custom `stopwords` list. The original `text_clf_lsvc` pipeline is left untouched.
text_clf_lsvc2 = Pipeline([('tfidf', TfidfVectorizer(stop_words=stopwords)),
                     ('clf', LinearSVC()),
])
# Despite its name, `text_clf_nb2` is a LinearSVC pipeline, not Naïve Bayes.
# The name is kept as an alias because later cells refer to it.
text_clf_nb2 = text_clf_lsvc2
text_clf_lsvc2.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(stop_words=['a', 'about', 'an', 'and', 'are',
                                             'as', 'at', 'be', 'been', 'but',
                                             'by', 'can', 'even', 'ever', 'for',
                                             'from', 'get', 'had', 'has',
                                             'have', 'he', 'her', 'hers', 'his',
                                             'how', 'i', 'if', 'in', 'into',
                                             'is', ...])),
                ('clf', LinearSVC())])

In [154]:
# Predict with the stop-word variant of the Linear SVC pipeline and show its
# confusion matrix (rows: true label, columns: predicted label).
predictions = text_clf_nb2.predict(X_test)
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[3999 3947]
 [3672 4695]]


In [155]:
# Per-class precision / recall / F1 for the stop-word variant.
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.52      0.50      0.51      7946
           1       0.54      0.56      0.55      8367

    accuracy                           0.53     16313
   macro avg       0.53      0.53      0.53     16313
weighted avg       0.53      0.53      0.53     16313



In [156]:
# Overall accuracy of the stop-word variant on the test split.
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.5329491816342794


## Feed new data into a trained model
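Once we've trained a model, it's time to feed new data through it. Note that the models above reach only about 53% accuracy on the test set, barely better than always predicting the majority class (about 51%), so their predictions should be treated with caution. In this last section we'll write our own news sentences, and see which label, "positive" or "negative", our model assigns to each of them.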

### Feed new data to the model's `predict()` method

In [142]:
# First hand-written news sentence to classify.
myreview = "Wall Street closed modestly lower on Monday, adding to last week's sharp losses on nagging concerns about the Federal Reserve's determination to aggressively hike interest rates to fight inflation even as the economy slows."

In [143]:
# Classify the sentence with the Linear SVC pipeline (the one without custom stop words).
# NOTE(review): the training text had non-letter characters replaced by spaces and was
# lower-cased before fitting, while `myreview` is passed in raw. Confirm that the
# vectorizer's own tokenisation makes this difference harmless for inputs with digits.
print(text_clf_lsvc.predict([myreview]))  # be sure to put "myreview" inside square brackets

[1]


In [144]:
# Second hand-written news sentence; this rebinds `myreview`.
myreview = "European markets were lower on Monday after U.S. Federal Reserve Chair Jerome Powell signaled higher interest rates would likely persist in a bid to tame soaring inflation. "

In [167]:
# Classify the second sentence with the Naïve Bayes pipeline; the result is a one-element array.
a=text_clf_nb.predict([myreview])

In [169]:
# Display the prediction array.
a

array([1], dtype=int64)

In [172]:
# Extract the single predicted label from the array.
a[0]

1

## Save the model 

In [177]:
# Refit the Naïve Bayes pipeline on the full dataset (train and test rows together)
# before saving it. The metrics reported earlier came from the train-split fit,
# not from this final fit.
text_clf_nb.fit(X, y)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [178]:
import joblib 
# Persist the fitted Naïve Bayes pipeline to the file "Naive Bayes_model"
# (a relative path, so it is written to the current working directory).
joblib.dump(text_clf_nb,"Naive Bayes_model")
# To restore it later: text_clf_nb = joblib.load("Naive Bayes_model")
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.

['Full_Data1_modelEUR']