In [1]:
import numpy as np # 
import pandas as pd # 

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

In [2]:
# load data
tech_df = pd.read_json('resources/technology_labeled_data.json', orient='index')
tech_df.head()

Unnamed: 0,label,short_description
10206,Positive,In 2016 LinkNYC began deploying free public Wi-Fi kiosks throughout the city. The kiosks made news when people began using
1028,Negative,The first-of-its-kind accident killed a pedestrian in Arizona earlier this year.
10426,Positive,"If you’re still looking for gift inspiration, these 12 days have got you covered"
10616,Positive,The former employee who had shut down The Donald for 11 minutes says he loves Twitter and America.
10711,Negative,And it's working with other companies to scrub the internet of recruitment propaganda.


In [3]:
tech_df['label'].value_counts()

Positive    900
Neutral     698
Negative    484
Name: label, dtype: int64

In [4]:
tech_df.shape

(2082, 2)

In [5]:
# View data information
tech_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2082 entries, 10206 to 9823
Data columns (total 2 columns):
label                2082 non-null object
short_description    2082 non-null object
dtypes: object(2)
memory usage: 48.8+ KB


In [6]:
tech_df.tail(20)

Unnamed: 0,label,short_description
85805,Positive,"For instance, driving for Lyft or Uber can provide significant supplemental wages or be a temporary gig with a great deal"
85811,Positive,"Connecting billions of devices, machines, sensors and appliances to the Internet could generate tremendous economic value"
85995,Positive,It doesn't take a cyber-security expect to know that you shouldn't share your email password or Social Security number on
86038,Negative,"Protecting the privacy of citizens from those who would do them harm or steal from them is now intrinsically bound to encrypting devices, communications and data.\n\nThat's true whether for cellphones, email, health records, tax transcripts or the personnel files of tens of millions of public servants."
86055,Neutral,The story behind a hashtag that helped transform the nation's opinion at hyper-speed.
86093,Negative,"On Thursday, Facebook announced a change to its News Feed: Instead of passively following the glut of content selected by"
86165,Positive,"On Tuesday, the Department of Homeland Security showed how to deploy government technology poorly. Now, the Federal Election"
86177,Neutral,"Based upon your experiences to date, what advice would you give to other people in the tech industry considering public service"
8623,Neutral,"Remembering Equifax, Uber and all the other data breaches of the last year."
86273,Neutral,I've asked DHS all of these questions. If I receive a response to them or to the FOIA request that may or may not have been


# install spacy

!pip install spacy
!python -m spacy download en

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [8]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    tokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    tokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens ]

    # Removing stop words
    tokens = [ word for word in tokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return tokens

In [9]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [10]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

In [11]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [12]:
from sklearn.model_selection import train_test_split

X = tech_df['short_description'] # the features we want to analyze
ylabels = tech_df['label'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

In [13]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)



Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x7fb5712dde50>),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 t...\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x7fb56f43eb90>,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
            

In [14]:
from sklearn import metrics
from sklearn.metrics import classification_report
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted,average=None))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted,average=None))
print(classification_report(y_test, predicted))

Logistic Regression Accuracy: 0.608
Logistic Regression Precision: [0.62162162 0.54814815 0.66192171]
Logistic Regression Recall: [0.30872483 0.73267327 0.67883212]
              precision    recall  f1-score   support

    Negative       0.62      0.31      0.41       149
     Neutral       0.55      0.73      0.63       202
    Positive       0.66      0.68      0.67       274

    accuracy                           0.61       625
   macro avg       0.61      0.57      0.57       625
weighted avg       0.62      0.61      0.59       625



In [15]:
# Hyper Tuning the parameters
# Create the GridSearchCV model

from sklearn.model_selection import GridSearchCV

# define models and parameters
solvers = ['liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
new_grid = {'classifier__' + key: grid[key] for key in grid }
grid_search = GridSearchCV(pipe, param_grid=new_grid, n_jobs = 1, cv=3, verbose=3,refit=True)

# model generation
grid_result = grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] classifier__C=100, classifier__penalty=l2, classifier__solver=liblinear 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  classifier__C=100, classifier__penalty=l2, classifier__solver=liblinear, score=0.583, total=   0.5s
[CV] classifier__C=100, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=100, classifier__penalty=l2, classifier__solver=liblinear, score=0.582, total=   0.6s
[CV] classifier__C=100, classifier__penalty=l2, classifier__solver=liblinear 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.1s remaining:    0.0s


[CV]  classifier__C=100, classifier__penalty=l2, classifier__solver=liblinear, score=0.531, total=   0.6s
[CV] classifier__C=10, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=10, classifier__penalty=l2, classifier__solver=liblinear, score=0.583, total=   0.4s
[CV] classifier__C=10, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=10, classifier__penalty=l2, classifier__solver=liblinear, score=0.584, total=   0.5s
[CV] classifier__C=10, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=10, classifier__penalty=l2, classifier__solver=liblinear, score=0.554, total=   0.4s
[CV] classifier__C=1.0, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=1.0, classifier__penalty=l2, classifier__solver=liblinear, score=0.600, total=   0.4s
[CV] classifier__C=1.0, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=1.0, classifier__penalty=l2, classifier__solver=liblinear, score=0.599, total=   0.4s
[CV] classifier__C=1.0, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=1.0, classifier__penalty=l2, classifier__solver=liblinear, score=0.550, total=   0.5s
[CV] classifier__C=0.1, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=0.1, classifier__penalty=l2, classifier__solver=liblinear, score=0.575, total=   0.4s
[CV] classifier__C=0.1, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=0.1, classifier__penalty=l2, classifier__solver=liblinear, score=0.560, total=   0.4s
[CV] classifier__C=0.1, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=0.1, classifier__penalty=l2, classifier__solver=liblinear, score=0.548, total=   0.4s
[CV] classifier__C=0.01, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=0.01, classifier__penalty=l2, classifier__solver=liblinear, score=0.427, total=   0.4s
[CV] classifier__C=0.01, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=0.01, classifier__penalty=l2, classifier__solver=liblinear, score=0.430, total=   0.5s
[CV] classifier__C=0.01, classifier__penalty=l2, classifier__solver=liblinear 


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    6.9s finished


[CV]  classifier__C=0.01, classifier__penalty=l2, classifier__solver=liblinear, score=0.432, total=   0.5s




In [16]:
grid_search.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] classifier__C=100, classifier__penalty=l2, classifier__solver=liblinear 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  classifier__C=100, classifier__penalty=l2, classifier__solver=liblinear, score=0.583, total=   0.5s
[CV] classifier__C=100, classifier__penalty=l2, classifier__solver=liblinear 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s


[CV]  classifier__C=100, classifier__penalty=l2, classifier__solver=liblinear, score=0.582, total=   0.5s
[CV] classifier__C=100, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=100, classifier__penalty=l2, classifier__solver=liblinear, score=0.531, total=   0.4s
[CV] classifier__C=10, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=10, classifier__penalty=l2, classifier__solver=liblinear, score=0.583, total=   0.4s
[CV] classifier__C=10, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=10, classifier__penalty=l2, classifier__solver=liblinear, score=0.584, total=   0.4s
[CV] classifier__C=10, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=10, classifier__penalty=l2, classifier__solver=liblinear, score=0.554, total=   0.4s
[CV] classifier__C=1.0, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=1.0, classifier__penalty=l2, classifier__solver=liblinear, score=0.600, total=   0.5s
[CV] classifier__C=1.0, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=1.0, classifier__penalty=l2, classifier__solver=liblinear, score=0.599, total=   0.5s
[CV] classifier__C=1.0, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=1.0, classifier__penalty=l2, classifier__solver=liblinear, score=0.550, total=   0.4s
[CV] classifier__C=0.1, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=0.1, classifier__penalty=l2, classifier__solver=liblinear, score=0.575, total=   0.4s
[CV] classifier__C=0.1, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=0.1, classifier__penalty=l2, classifier__solver=liblinear, score=0.560, total=   0.5s
[CV] classifier__C=0.1, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=0.1, classifier__penalty=l2, classifier__solver=liblinear, score=0.548, total=   0.5s
[CV] classifier__C=0.01, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=0.01, classifier__penalty=l2, classifier__solver=liblinear, score=0.427, total=   0.5s
[CV] classifier__C=0.01, classifier__penalty=l2, classifier__solver=liblinear 




[CV]  classifier__C=0.01, classifier__penalty=l2, classifier__solver=liblinear, score=0.430, total=   0.4s
[CV] classifier__C=0.01, classifier__penalty=l2, classifier__solver=liblinear 


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    6.8s finished


[CV]  classifier__C=0.01, classifier__penalty=l2, classifier__solver=liblinear, score=0.432, total=   0.5s




GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cleaner',
                                        <__main__.predictors object at 0x7fb5712dde50>),
                                       ('vectorizer',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                        

In [17]:
# summarize results
print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))

Best: 0.582704 using {'classifier__C': 1.0, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}


In [18]:
predicted = grid_search.predict(X_test)

# Model Accuracy
print("Logistic Regression with Hypertuning Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression with Hypertuning Precision:",metrics.precision_score(y_test, predicted,average=None))
print("Logistic Regression with Hypertuning Recall:",metrics.recall_score(y_test, predicted,average=None))

Logistic Regression with Hypertuning Accuracy: 0.608
Logistic Regression with Hypertuning Precision: [0.62162162 0.54814815 0.66192171]
Logistic Regression with Hypertuning Recall: [0.30872483 0.73267327 0.67883212]


In [19]:
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

    Negative       0.62      0.31      0.41       149
     Neutral       0.55      0.73      0.63       202
    Positive       0.66      0.68      0.67       274

    accuracy                           0.61       625
   macro avg       0.61      0.57      0.57       625
weighted avg       0.62      0.61      0.59       625

