# Vectorising the Stamford Training Dataset

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

In [2]:
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

In [3]:
from nltk.tokenize import sent_tokenize

### Read Stamford Training Data

In [4]:
stamford = pd.read_csv("stamfordtraining.csv", header=None)

In [5]:
stamford.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,0,0,"http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,0,0,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,0,0,I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,0,0,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,0,0,"no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


In [6]:
stamford[8] = stamford[7].apply(lambda x: ' '.join([word for word in x.split() if word not in (ENGLISH_STOP_WORDS)]))

In [7]:
stamford = stamford.drop(7, axis=1)

### Fit & Transform Vectorizer

In [9]:
cv = CountVectorizer(binary=True)

In [10]:
cv.fit(stamford[8])

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [11]:
X = cv.transform(stamford[8])

### Train Logistic Regression Model

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [13]:
target = stamford[6]

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, target, test_size = 0.25, random_state=123)

In [15]:
lr = LogisticRegression(C=0.25, fit_intercept=False)

In [16]:
lr.fit(X_train, y_train)



LogisticRegression(C=0.25, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [17]:
lr.predict(X_val[0:100])

array([0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0])

### Print accuracy

In [18]:
print("Score: {:.2f}".format(lr.score(X_val, y_val)))

Score: 0.78


### Apply model to new text

In [21]:
text = ["Odion Ighalo gave United the lead", "He smashed it into the net"]

print("Text prediction: {}". format(lr.predict(cv.transform(text))))

Text prediction: [1 0]


### Make a function out of model

In [24]:
def sentiment(input):
    #Tokenize the sentences
    sentences = sent_tokenize(input)
    
    #Remove Stopwords
    #no_stopwords = ' '.join([word for word in input if word not in (ENGLISH_STOP_WORDS)])
    
    # Predictions
    predictions = pd.DataFrame(lr.predict_proba(cv.transform(sentences)))
    
    # Create DataFrame
    predictions['sentences'] = sentences
    
    # Rename Cols
    predictions = predictions.rename(columns={0: "negative_prob", 1: "positive_prob"})
    
    # Declare Threshold
    threshold = 0.6
    
    # Sentiment List
    sentiment_list = []
    
    # Determine sentiment 
    for index, row in predictions.iterrows():
        if row['negative_prob'] > threshold:
            sentiment_list.append("negative")
        elif row['positive_prob'] > threshold:
            sentiment_list.append("positive")
        else:
            sentiment_list.append("neutral")
            
    predictions['sentiment'] = sentiment_list
    
    return predictions;


In [27]:
sentiment("""It was business as usual in a comfortable victory for Bayern Munich at Union Berlin, even if the low-key, low-volume return of the Bundesliga is still a highly unusual sight to behold.

Taking three points moved Germany’s champions four ahead of Borussia Dortmund, with a Der Klassiker meeting due on 26 May. Robert Lewandowski collected his customary goal with a penalty and the defender Benjamin Pavard later headed home a Joshua Kimmich corner for the second.


Fewer dives, a missing coach and subs in stand: inside Bundesliga's return
 Read more
Bayern will need to find extra gears against Dortmund, highly impressive in beating Schalke 4-0 on Saturday. Win that and an eighth successive title will be within reach. “We didn’t show our best side in terms of play,” admitted the goalkeeper Manuel Neuer afterwards. “But we dominated the game and deservedly take the three points home with us.”

Union, safely in mid-table, have performed creditably since last season’s promotion but were facing the Bundesliga’s ultimate test, and doing so without home fans among the most fervent in Germany. Berlin police had warned supporters to stay away but later reported that a few dozen had gathered outside the stadium.

Also absent was the Union coach, Urs Fischer, after he broke quarantine to return home to Switzerland following a family bereavement. The plan he had passed on to his players was clear, though: sit back, soak up pressure and see what else might arise.


Anthony Ujah, their Nigeria forward, fired wide in the early stages but Bayern soon settled into dominance of the limited opposition. They had the ball in the net in the 18th minute, only for Thomas Müller to be ruled offside after he took a final – and unnecessary – touch of a goalbound header from Serge Gnabry. The VAR decision was greeted by a throaty roar from the small home contingent permitted to be inside the tight stadium set in a forest on the fringes of Germany’s capital.

The first half-hour saw Union offer stiff resistance, reducing Lewandowski to penalty-area scraps amid some frustration for Bayern. Alphonso Davies’s high tackle on Grischa Prömel brought the Canada full-back a yellow card. Then came Neven Subotic’s flailing foul on Leon Goretzka, who had blindsided the Serb, and a chance from the spot that Lewandowski was never going to pass up.

The Polish striker scored his 40th of the season, making it the fifth season in succession he has reached that tally, with a feint, a jump and a firm shot beyond his compatriot Rafal Gikiewicz.

Bayern’s attempts on goal piled up but they struggled to reproduce the slick football and threat they had exerted in pre-lockdown performances such as their late-February 3-0 Champions League win at Chelsea.


Fewer dives, a missing coach and subs in stand: inside Bundesliga's return
 Read more
Kingsley Coman, on as a second-half substitute, howled in anguish when his driven cross failed to find any takers but Pavard’s neck muscles eventually made sure of the win 10 minutes from time.

A controlled performance with few scares suffered made it 15 wins from Bayern’s past 16 matches. The Dortmund encounter will, though, be far more indicative of whether the Bundesliga title is likely to return to its perennial destination.”""")

Unnamed: 0,negative_prob,positive_prob,sentences,sentiment
0,0.030321,0.969679,"It was business as usual in a comfortable victory for Bayern Munich at Union Berlin, even if the low-key, low-volume return of the Bundesliga is still a highly unusual sight to behold.",positive
1,0.120591,0.879409,"Taking three points moved Germany’s champions four ahead of Borussia Dortmund, with a Der Klassiker meeting due on 26 May.",positive
2,0.062247,0.937753,Robert Lewandowski collected his customary goal with a penalty and the defender Benjamin Pavard later headed home a Joshua Kimmich corner for the second.,positive
3,0.687688,0.312312,"Fewer dives, a missing coach and subs in stand: inside Bundesliga's return\n Read more\nBayern will need to find extra gears against Dortmund, highly impressive in beating Schalke 4-0 on Saturday.",negative
4,0.234896,0.765104,Win that and an eighth successive title will be within reach.,positive
5,0.480885,0.519115,"“We didn’t show our best side in terms of play,” admitted the goalkeeper Manuel Neuer afterwards.",neutral
6,0.146007,0.853993,"“But we dominated the game and deservedly take the three points home with us.”\n\nUnion, safely in mid-table, have performed creditably since last season’s promotion but were facing the Bundesliga’s ultimate test, and doing so without home fans among the most fervent in Germany.",positive
7,0.653453,0.346547,Berlin police had warned supporters to stay away but later reported that a few dozen had gathered outside the stadium.,negative
8,0.907272,0.092728,"Also absent was the Union coach, Urs Fischer, after he broke quarantine to return home to Switzerland following a family bereavement.",negative
9,0.13207,0.86793,"The plan he had passed on to his players was clear, though: sit back, soak up pressure and see what else might arise.",positive
