In [1]:
%run __init__.py

In [2]:
import pandas as pd
import numpy as np
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, MultiLabelBinarizer

from sklearn.multiclass import OneVsRestClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

In [3]:
apple_news_df = pd.read_csv('./data/apple_news_cleaned.csv.gz')

In [4]:
ohe = OneHotEncoder(sparse=False)
y_ohe = ohe.fit_transform(apple_news_df['sentiment'].values.reshape(-1,1))

In [5]:
# MultiLabelBinarizer(apple_news_df['sentiment'])
# mlb = MultiLabelBinarizer()
# y_bin = mlb.fit_transform(apple_news_df['sentiment'])

In [6]:
def class_weights_ohe(Y_ohe):
    #  n_samples / (n_classes * np.bincount(y))
    # https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html
    bincount = np.array([np.bincount(Y_ohe[:,x].astype(np.int32))[1] for x in range(0, Y_ohe.shape[1])])
    weights = np.array(Y_ohe.shape[0] / (Y_ohe.shape[1] * bincount))
    class_weights={k:v for k,v in enumerate(weights)}
    return weights

In [7]:
weights = {k+1:v for k,v in enumerate(class_weights_ohe(y_ohe))}
weights

{1: 1.482866043613707, 2: 1.0493827160493827, 3: 0.7284970921334558}

In [8]:
tfidfVectorizer = TfidfVectorizer(max_features=10_000,
                                  norm='l2', 
                                  strip_accents='ascii',
                                  analyzer='word',
                                  ngram_range=(1,1))

In [9]:
X_train, X_test, y_train, y_test = train_test_split(apple_news_df['text'],
                                                    y_ohe,
                                                    test_size=.15,
                                                    random_state=1,
                                                    stratify=y_ohe
                                                   )

In [10]:
X_train = tfidfVectorizer.fit_transform(X_train).toarray()

In [11]:
X_test = tfidfVectorizer.fit_transform(X_test).todense()

In [12]:
X_train.shape, y_train.shape

((8092, 10000), (8092, 3))

In [57]:
# from sklearn.utils import class_weight
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder().fit_transform(apple_news_df['sentiment'])
# class_weight.compute_class_weight('balanced',np.unique( apple_news_df['sentiment'] ),y_train)

In [13]:
# sgd = SGDClassifier()loss="modified_huber"
# nb = MultinomialNB()
LogisticRegression()multi_class=”multinomial”

In [24]:
lr = LogisticRegression(multi_class='multinomial')

In [None]:
lr.fit(X_train, y_train)

In [21]:
clf = OneVsRestClassifier(LogisticRegression(multi_class='multinomial')).fit(X_train, y_train)

In [18]:
from sklearn.metrics import accuracy_score

In [None]:
y_pred = clf.predict(y_test)

In [17]:
y_test.shape

(1428, 3)

In [32]:
clf = OneVsRestClassifier(LogisticRegression())

In [33]:
clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression())

In [34]:
print("Accuracy: {}".format(clf.score(X_test, y_test)))

Accuracy: 0.01330532212885154


In [None]:
pipeline = Pipeline(
    [
        ("vectorizer", CountVectorizer()),
        ("tfidf_trans", TfidfTransformer())
    ]
                    )

In [None]:
np.