In [6]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
# cols = ['sentiment','id','date','query_string','user','text']
df = pd.read_csv("./data/tweet-sentiment-extraction/train.csv")
# above line will be different depending on where you saved your data, and your file name
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [3]:
df.sentiment.value_counts()

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [5]:
# Remove URLs
import re
re.sub('https?://[A-Za-z0-9./]+','',df.text[0])

' I`d have responded, if I were going'

In [7]:
import spacy
from spacy.matcher.phrasematcher import PhraseMatcher
nlp = spacy.load("en_core_web_sm")

In [9]:
text = "The rain in Spain falls mainly on the plain."
doc = nlp(text)
 
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.is_stop)
    

The the DET True
rain rain NOUN False
in in ADP True
Spain Spain PROPN False
falls fall VERB False
mainly mainly ADV False
on on ADP True
the the DET True
plain plain NOUN False
. . PUNCT False


In [10]:
import pandas as pd
 
cols = ("text", "lemma", "POS", "explain", "stopword")
rows = []
 
for t in doc:
    row = [t.text, t.lemma_, t.pos_, spacy.explain(t.pos_), t.is_stop]
    rows.append(row)
 
df = pd.DataFrame(rows, columns=cols)
 
df

Unnamed: 0,text,lemma,POS,explain,stopword
0,The,the,DET,determiner,True
1,rain,rain,NOUN,noun,False
2,in,in,ADP,adposition,True
3,Spain,Spain,PROPN,proper noun,False
4,falls,fall,VERB,verb,False
5,mainly,mainly,ADV,adverb,False
6,on,on,ADP,adposition,True
7,the,the,DET,determiner,True
8,plain,plain,NOUN,noun,False
9,.,.,PUNCT,punctuation,False


In [15]:
# Remove punctuation and stop words, and show the lemmatized version
# Note token.orth_ is the original version
[token.lemma_ for token in doc if not ( token.is_punct | token.is_space | token.is_stop) ] 


['rain', 'Spain', 'fall', 'mainly', 'plain']

In [16]:

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
my_doc = nlp(text)

# Create list of word tokens
token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)

['When', 'learning', 'data', 'science', ',', 'you', 'should', "n't", 'get', 'discouraged', '!', '\n', 'Challenges', 'and', 'setbacks', 'are', "n't", 'failures', ',', 'they', "'re", 'just', 'part', 'of', 'the', 'journey', '.', 'You', "'ve", 'got', 'this', '!']


In [19]:
#Implementation of stop words:
filtered_sent=[]

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# filtering stop words
for word in doc:
    if word.is_stop==False:
        filtered_sent.append(word)
    print(word.text,word.lemma_, word.pos_)
print("Filtered Sentence:",filtered_sent)

When when ADV
learning learn VERB
data datum NOUN
science science NOUN
, , PUNCT
you -PRON- PRON
should should VERB
n't not PART
get get AUX
discouraged discourage VERB
! ! PUNCT

 
 SPACE
Challenges challenge NOUN
and and CCONJ
setbacks setback NOUN
are be AUX
n't not PART
failures failure NOUN
, , PUNCT
they -PRON- PRON
're be AUX
just just ADV
part part NOUN
of of ADP
the the DET
journey journey NOUN
. . PUNCT
You -PRON- PRON
've have AUX
got get VERB
this this DET
! ! PUNCT
Filtered Sentence: [learning, data, science, ,, discouraged, !, 
, Challenges, setbacks, failures, ,, journey, ., got, !]


In [20]:
docp = nlp (" In pursuit of a wall, President Trump ran into one.")

for chunk in docp.noun_chunks:
   print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

pursuit pursuit pobj In
a wall wall pobj of
President Trump Trump nsubj ran


In [21]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [31]:
def custom_tokenizer(sentence):
    """
        remove stop words, and lemmatize 
    """
    tokens = nlp(sentence)
    
    ret = [token.lemma_ for token in tokens if not ( token.is_punct | token.is_space | token.is_stop ) ]

    return ret

In [32]:
custom_tokenizer(" In pursuit of a wall, President Trump ran into one.")

['pursuit', 'wall', 'President', 'Trump', 'run']

In [33]:
bow_vector = CountVectorizer(tokenizer = custom_tokenizer, ngram_range=(1,1))

In [29]:
bow_vector

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function tokenizer at 0x1a3010b0e0>,
                vocabulary=None)

In [37]:
import nltk
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/rebjl/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [34]:
tfidf_vector = TfidfVectorizer(tokenizer = custom_tokenizer)

In [39]:
import numpy as np
from nltk.probability import FreqDist
from nltk.classify import SklearnClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k=1000)),
                     ('nb', MultinomialNB())])
classif = SklearnClassifier(pipeline)

from nltk.corpus import movie_reviews
pos = [FreqDist(movie_reviews.words(i)) for i in movie_reviews.fileids('pos')]
neg = [FreqDist(movie_reviews.words(i)) for i in movie_reviews.fileids('neg')]
print(pos)
add_label = lambda lst, lab: [(x, lab) for x in lst]
classif.train(add_label(pos[:100], 'pos') + add_label(neg[:100], 'neg'))

l_pos = np.array(classif.classify_many(pos[100:]))
l_neg = np.array(classif.classify_many(neg[100:]))
print("Confusion matrix:\n%d\t%d\n%d\t%d" % (
          (l_pos == 'pos').sum(), (l_pos == 'neg').sum(),
          (l_neg == 'pos').sum(), (l_neg == 'neg').sum()))

[FreqDist({'the': 46, ',': 43, "'": 25, '.': 23, 'and': 21, '(': 18, ')': 18, 'in': 18, 'a': 15, 'to': 15, ...}), FreqDist({',': 52, '.': 40, 'the': 35, 'a': 22, 'is': 22, 'and': 20, "'": 17, 'to': 14, 'of': 12, 'in': 12, ...}), FreqDist({'the': 33, ',': 28, '.': 17, "'": 11, 'it': 10, 'of': 10, 'a': 9, 'and': 9, 'as': 9, 'to': 7, ...}), FreqDist({',': 72, 'the': 63, '.': 44, 'a': 34, '"': 32, 'and': 29, 'of': 28, 'is': 22, 'it': 22, "'": 19, ...}), FreqDist({',': 55, 'the': 41, '.': 25, 'a': 20, 'in': 20, 'and': 18, "'": 16, 'of': 15, 'to': 15, '-': 12, ...}), FreqDist({',': 62, 'the': 61, 'of': 37, '.': 34, 'and': 27, 'a': 26, 'to': 23, 'his': 20, 'in': 13, 'lumumba': 13, ...}), FreqDist({',': 58, 'a': 37, 'the': 33, '.': 32, 'and': 22, 'of': 21, "'": 20, 'in': 19, 'to': 18, 'is': 17, ...}), FreqDist({',': 45, '.': 32, 'the': 29, 'a': 26, 'and': 25, "'": 24, 's': 20, 'is': 20, '-': 18, 'of': 13, ...}), FreqDist({',': 24, "'": 15, '.': 13, 'the': 11, 's': 11, 'and': 10, 'lumumba': 8, 

Confusion matrix:
427	473
132	768


In [267]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

class Debug(TransformerMixin, BaseEstimator):
    def __init__(self, demo_param='demo'):
        self.demo_param = demo_param
        self.fit_result = []
        self.transform_result = []

    def fit(self, X, y=None):
        # Return the transformer
        self.fit_result = X
        return self

    def transform(self, X):
        # No op transform
        self.transform_result = X
        return X

def sparse_to_df(sp_matrix, feat_names):
    return pd.DataFrame(sp_matrix.todense(), columns=feat_names)

def convert_log_prob_to_df_prob(log_prob, class_names):
    # Transpose the class labels into the columns
    prob_arr = np.array(log_prob)
    prob_arr = np.transpose(prob_arr)
    print(prob_arr)
    probs = pd.DataFrame(prob_arr, columns = class_names)
    # Unlog by taking exponents
    probs = probs.apply(np.exp)
    # Add the feature names as the first column
    probs['feature'] = k_best_feat
    probs.set_index('feature', inplace=True)
    return probs

# initialize list of lists 
weather =  [
        # ['Sunny','Hot','Normal','Calm','Y'],
        ['Overcast','Mild','Normal','Calm','Y'],
        ['Sunny','Cool','Normal','Windy','Y'],
        ['Sunny','Hot','Normal','Windy','N'],
        ['Overcast','Cool','Humid','Windy','N'],
        ['Sunny','Mild','Humid','Calm','Y'],
        ['Overcast','Mild','Normal','Calm','Y'],
        ['Rainy','Cool','Humid','Windy','N'],
        ['Rainy','Hot','Normal','Windy','N']]
  
# Create the pandas DataFrame 
df = pd.DataFrame(weather, columns = ['Weather','Temperature','Humidity','Wind','Nice']) 
df_text = df["Weather"] + " " + df["Temperature"] + " " + df["Humidity"] + " " +  df["Wind"]

X = df_text
y = df['Nice']

pipe = Pipeline([('count', CountVectorizer()),
                 ('countvectorizer_debug', Debug()),
                 # ('tf_idf', TfidfTransformer(norm=None)),
                 # ('tf_idf_debug', Debug()),
                 ('chi2', SelectKBest(chi2, k=3)),
                 ('kbest_debug', Debug()),
                 ('clf', MultinomialNB(alpha=0))])

result = pipe.fit(X, y)

feat_names = pipe['count'].get_feature_names()

print(sparse_to_df(pipe['countvectorizer_debug'].fit_result, pipe['count'].get_feature_names() ) )

print("K best features")
k_best_feat = [feat_names[i] for i in pipe['chi2'].get_support(indices=True)]
k_best_df = sparse_to_df(pipe['kbest_debug'].fit_result, k_best_feat)
k_best_df['Nice actuals'] = y

y_preds = pipe.predict(X)
print(y_preds)
# Confusion matrix and classification report
c_m = confusion_matrix(y, y_preds)
print(c_m)
print(classification_report(y, y_preds))

k_best_df['Nice predictions'] = pd.Series(y_preds)
print("K best features with predictions")
print(k_best_df)

probs = convert_log_prob_to_df_prob(pipe['clf'].feature_log_prob_, pipe['clf'].classes_)

# For each feature, calculate it's p(feature | outcome)
rows = []

for f in k_best_feat:
    vcs = [f]
    for c in pipe['clf'].classes_ :
        vc = k_best_df[k_best_df['Nice actuals'] == c][f].value_counts()
        # Get the 1's
        vc = vc.get(1) if vc.get(1) else 0
        vcs.append(vc)
    rows.append(vcs)

cols = ['features']
cols.extend(pipe['clf'].classes_)

likelihoods_df = pd.DataFrame(rows, columns = cols)
for c in pipe['clf'].classes_ :
    likelihoods_df[c] = likelihoods_df[c] / likelihoods_df[c].sum()
print(likelihoods_df)
    


   calm  cool  hot  humid  mild  normal  overcast  rainy  sunny  windy
0     1     0    0      0     1       1         1      0      0      0
1     0     1    0      0     0       1         0      0      1      1
2     0     0    1      0     0       1         0      0      1      1
3     0     1    0      1     0       0         1      0      0      1
4     1     0    0      1     1       0         0      0      1      0
5     1     0    0      0     1       1         1      0      0      0
6     0     1    0      1     0       0         0      1      0      1
7     0     0    1      0     0       1         0      1      0      1
K best features
['Y' 'N' 'N' 'N' 'Y' 'Y' 'N' 'N']
[[4 0]
 [1 3]]
              precision    recall  f1-score   support

           N       0.80      1.00      0.89         4
           Y       1.00      0.75      0.86         4

    accuracy                           0.88         8
   macro avg       0.90      0.88      0.87         8
weighted avg       0.90 

  'setting alpha = %.1e' % _ALPHA_MIN)


In [57]:
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
docs = ["You can catch more flies with honey than you can with vinegar.",
         "You can lead a horse to water, but you can't make him drink."]
vect = CountVectorizer(min_df=0., max_df=1.0)
X = vect.fit_transform(docs)
print(DataFrame(X.A, columns=vect.get_feature_names()).to_string())

   but  can  catch  drink  flies  him  honey  horse  lead  make  more  than  to  vinegar  water  with  you
0    0    2      1      0      1    0      1      0     0     0     1     1   0        1      0     2    2
1    1    2      0      1      0    1      0      1     1     1     0     0   1        0      1     0    2


In [84]:
from sklearn.base import BaseEstimator, TransformerMixin
class FeatureMultiplier(BaseEstimator, TransformerMixin):
    def __init__(self, factor):
        self.factor = factor

    def transform(self, X, *_):
        return X * self.factor

    def fit(self, *_):
        return self

fm = FeatureMultiplier(2)

test = np.diag((1,2,3,4))
print(test)

fm.transform(test)

[[1 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [0 0 0 4]]


array([[2, 0, 0, 0],
       [0, 4, 0, 0],
       [0, 0, 6, 0],
       [0, 0, 0, 8]])

In [124]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X, y = load_iris(return_X_y=True)
X.shape

X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
X_new.shape

(150, 2)

In [128]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [226]:
k_best_df['calm'].value_counts()


0    5
1    3
Name: calm, dtype: int64

In [None]:

corpus = ['This little piggy went to market',
          'This little piggy stayed home',
          'This little piggy had roast beef',
          'This little piggy had none',
          'And this little piggy cried "wee wee wee" all the way home']
# Happiness of pigs
y = [0, 1, 1, 0, 0]

In [233]:
 k_best_df[k_best_df['Nice actuals'] == 'Y'][f].value_counts()

0    4
Name: rainy, dtype: int64

In [232]:
type( k_best_df[k_best_df['Nice actuals'] == 'Y'][f].value_counts())

pandas.core.series.Series