In [75]:

import sentiment1
import json
from bson import json_util
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import logging
import os
import warnings
import sys


In [76]:

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

# Pull every document from the collection exposed by the sentiment1 module
# (presumably a MongoDB collection -- verify in sentiment1) and round-trip it
# through bson.json_util so BSON-only values become plain JSON; this is why
# the id column ends up flattened as '_id.$oid'.
sanitized = json.loads(json_util.dumps(sentiment1.temp_col.find()))

# json_normalize already returns a DataFrame, so no extra wrapping is needed.
normalized = pd.json_normalize(sanitized)

# Sort by tweet id and drop rows without text. The explicit .copy() makes df
# an independent frame, so the column assignment below does not write to a
# slice of `normalized` (which can raise SettingWithCopyWarning).
df = normalized.sort_values('id', ascending=True)
df = df[df['text'].notnull()].copy()

# Encode each distinct Polarity label as an integer, numbered in order of
# first appearance.
df['Polarity_int'] = df['Polarity'].factorize()[0]

# Lookup tables between the label text and its integer code.
polarity_id_df = df[['Polarity', 'Polarity_int']].drop_duplicates().sort_values('Polarity_int')
polarity_to_id = dict(polarity_id_df.values)
id_to_polarity = dict(polarity_id_df[['Polarity_int', 'Polarity']].values)
print(df["Polarity_int"])




0       0
1       1
2       2
3       2
4       1
       ..
3941    1
3942    1
3943    2
3944    1
3945    1
Name: Polarity_int, Length: 3946, dtype: int64


In [77]:
# Show the column labels of df as a quick schema check after normalization.
df.columns

Index(['index', 'id', 'text', 'tweet', 'Sentiment', 'Score', 'Polarity',
       'Tweet_punct', '_id.$oid', 'Polarity_int'],
      dtype='object')

In [78]:
# Preview the first five rows of df with all columns still present.
df.head(5)


Unnamed: 0,index,id,text,tweet,Sentiment,Score,Polarity,Tweet_punct,_id.$oid,Polarity_int
0,0,1.502671e+18,RT @dabeard: Tip for today:\nIf anyone gets in...,"[rt, dabeard, tip, today, anyone, get, face, a...",Neutral,1.0,Neutral,rt dabeard tip for today\nif anyone gets in yo...,622d44f39fcffa7dd4311aa5,0
1,1,1.502671e+18,"#COVID19Germany #BA2variant #BA2 48%\n""The nu...","[covidgermany, bavariant, ba, number, people, ...",Happy,0.262,Positive,covidgermany bavariant ba \nthe number of peo...,622d44f39fcffa7dd4311aa6,1
2,2,1.502671e+18,RT @ignis_fatum: ‼️#Putin regime propaganda ag...,"[rt, ignisfatum, putin, regime, propaganda, ag...",sad,0.251,Negative,rt ignisfatum ‼️putin regime propaganda agency...,622d44f39fcffa7dd4311aa7,2
3,3,1.502671e+18,RT @EvanBlake17: @EricTopol @OurWorldInData Re...,"[rt, evanblake, erictopol, ourworldindata, rec...",sad,0.218,Negative,rt evanblake erictopol ourworldindata record c...,622d44f39fcffa7dd4311aa8,2
4,4,1.502671e+18,RT @SaitamaUniverse: CryptoExpoDubai (Mar. 16-...,"[rt, saitamauniverse, cryptoexpodubai, mar, la...",Happy,0.231,Positive,rt saitamauniverse cryptoexpodubai mar is the...,622d44f39fcffa7dd4311aa9,1


In [79]:
# Narrow df to the raw tweet text plus the label in both its textual and
# integer forms; the remaining columns are dropped from here on.
KEEP_COLUMNS = ["text", "Polarity", "Polarity_int"]
df = df[KEEP_COLUMNS]
df.head(5)


Unnamed: 0,text,Polarity,Polarity_int
0,RT @dabeard: Tip for today:\nIf anyone gets in...,Neutral,0
1,"#COVID19Germany #BA2variant #BA2 48%\n""The nu...",Positive,1
2,RT @ignis_fatum: ‼️#Putin regime propaganda ag...,Negative,2
3,RT @EvanBlake17: @EricTopol @OurWorldInData Re...,Negative,2
4,RT @SaitamaUniverse: CryptoExpoDubai (Mar. 16-...,Positive,1


In [80]:


from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over word n-grams of length 1 to 10. Terms must occur in at least
# 10 documents, English stop words are removed, term frequency is scaled
# sublinearly and each row is L2-normalised.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=10,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 10),
    stop_words='english',
)

# Integer class labels, aligned row-for-row with the feature matrix.
labels = df['Polarity_int']

# Dense (n_documents, n_terms) array of TF-IDF weights.
features = tfidf.fit_transform(df['text']).toarray()
features.shape

(3946, 3042)

In [81]:
from sklearn.feature_selection import chi2
import numpy as np

# Upper bound on how many terms are printed per class. At 100000 it exceeds
# the vocabulary size (3042 features above), so every term is listed.
N = 100000

# The fitted vocabulary is identical for every class, so resolve it once.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf, "get_feature_names_out"):
    vocabulary = np.asarray(tfidf.get_feature_names_out())
else:
    vocabulary = np.array(tfidf.get_feature_names())

for Polarity, polarity_int in sorted(polarity_to_id.items()):
    # One-vs-rest chi-squared score of every term against this class.
    features_chi2 = chi2(features, labels == polarity_int)
    # Ascending order: the highest-scoring terms end up last.
    indices = np.argsort(features_chi2[0])
    feature_names = vocabulary[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Polarity))
    print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))

# 'Negative':
  . Most correlated unigrams:
       . therealplanc
       . held
       . brigade
       . founder
       . old
       . zelensky
       . billion
       . region
       . st
       . look
       . international
       . bullish
       . 17
       . story
       . eye
       . watch
       . mariupol
       . covid19
       . learn
       . ba
       . 11
       . recently
       . times
       . 1000
       . use
       . leaders
       . hear
       . think
       . buy
       . develops
       . moscow
       . donetsk
       . realise
       . going
       . 2021
       . family
       . pump
       . civilians
       . saying
       . exchange
       . feel
       . major
       . response
       . doing
       . test
       . french
       . want
       . shot
       . told
       . following
       . kherson
       . does
       . continue
       . likely
       . doesn
       . used
       . 24
       . telling
       . usa
       . updates
       . according
   

In [83]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score

# Seed for the train/test partition. Without it every execution of this cell
# produces a different split, so the scores below cannot be reproduced.
SPLIT_SEED = 40

X = df['text']
y = df['Polarity_int']

# No test_size is given, so train_test_split's default (25% held out) applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SPLIT_SEED)

# Learn the TF-IDF vocabulary from the training texts only, then reuse it for
# the held-out texts so no test information leaks into the features.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_dtm = vectorizer.fit_transform(X_train)
X_test_dtm = vectorizer.transform(X_test)

# Baseline 1: logistic regression with default settings.
clf_lr = LogisticRegression()
clf_lr.fit(X_train_dtm, y_train)
y_pred = clf_lr.predict(X_test_dtm)
lr_score = accuracy_score(y_test, y_pred)  # accuracy over the three polarity classes

# Baseline 2: multinomial naive Bayes on the same features.
clf_mnb = MultinomialNB()
clf_mnb.fit(X_train_dtm, y_train)
y_pred = clf_mnb.predict(X_test_dtm)
mnb_score = accuracy_score(y_test, y_pred)  # accuracy over the three polarity classes

In [84]:
# Shape of the full label vector (before the train/test split).
y.shape

(3946,)

In [85]:
# Print the sparse TF-IDF matrix of the held-out texts; each output line is a
# stored (row, column) position followed by its weight.
print(X_test_dtm)


  (0, 7299)	0.23677396806079018
  (0, 6475)	0.06377490140019665
  (0, 6231)	0.22038067947097054
  (0, 4932)	0.44974743987619564
  (0, 4547)	0.2380361974571192
  (0, 3910)	0.2708583351655942
  (0, 2958)	0.2708583351655942
  (0, 2954)	0.5028505395106776
  (0, 827)	0.15653069589869087
  (0, 743)	0.2380361974571192
  (0, 736)	0.38938282517223594
  (1, 6475)	0.17799651793789464
  (1, 3378)	0.6386811931166922
  (1, 1821)	0.7486010774511502
  (2, 6978)	0.4845969567610618
  (2, 6475)	0.08551376195761282
  (2, 6281)	0.40365594110875835
  (2, 3920)	0.32846250670460675
  (2, 3735)	0.1144564382761799
  (2, 3437)	0.3445920023649643
  (2, 2491)	0.35630848218187666
  (2, 1225)	0.1256087919563414
  (2, 654)	0.460923227297621
  (3, 6475)	0.09892760395236563
  (3, 5109)	0.5137929198668812
  :	:
  (982, 3735)	0.3131306982614785
  (983, 5940)	0.5916667147262804
  (983, 5237)	0.8061826707918065
  (984, 7396)	0.5533406928217307
  (984, 6475)	0.09764453455028411
  (984, 5208)	0.3243995783698054
  (984, 4266)

In [86]:
# Vectorise one ad-hoc string with the fitted vocabulary, classify it with the
# naive Bayes model, and check what kind of object predict() hands back.
sample_texts = ["OurWorldInData"]
prediction = clf_mnb.predict(vectorizer.transform(sample_texts))
print(type(prediction))

<class 'numpy.ndarray'>


In [94]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
import warnings
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import mlflow.sklearn
from urllib.parse import urlparse




def eval_metrics(actual, pred):
    """Score predictions against the true values with three regression metrics.

    Args:
        actual: Ground-truth values.
        pred: Predicted values, aligned element-for-element with ``actual``.

    Returns:
        A ``(rmse, mae, r2)`` tuple: root mean squared error, mean absolute
        error and the R^2 coefficient of determination.
    """
    return (
        np.sqrt(mean_squared_error(actual, pred)),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

def fetch_logged_data(run_id):
    """Read back what an MLflow run recorded.

    Args:
        run_id: Identifier of the MLflow run to inspect.

    Returns:
        A ``(params, metrics, tags, artifacts)`` tuple. ``tags`` omits every
        key starting with ``"mlflow."``; ``artifacts`` holds the paths listed
        under the run's ``"model"`` artifact directory.
    """
    client = mlflow.tracking.MlflowClient()
    run_data = client.get_run(run_id).data

    # Keep only the tags whose key is outside the "mlflow." namespace.
    user_tags = {}
    for key, value in run_data.tags.items():
        if key.startswith("mlflow."):
            continue
        user_tags[key] = value

    artifact_paths = [entry.path for entry in client.list_artifacts(run_id, "model")]
    return run_data.params, run_data.metrics, user_tags, artifact_paths



# Local SQLite database that backs MLflow. The model registry uses the
# tracking URI unless a separate registry URI is set, so this one setting
# serves both run tracking and model registration.
#
# An earlier set_tracking_uri("http://127.0.0.1:5000") call was immediately
# overridden by this one and never took effect, so it has been dropped. To
# log through a tracking server instead, replace the URI below.
registry_uri = 'sqlite:///mlflow.db'
mlflow.set_tracking_uri(registry_uri)

# Close any run left active by an earlier, interrupted execution of this
# notebook; otherwise the start_run() that follows would fail.
mlflow.end_run()





if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    np.random.seed(40)

# Train a logistic-regression classifier on the TF-IDF features and record
# its metrics and the fitted model in a single MLflow run. The context
# manager ends the run on exit, so no explicit end_run() is needed afterwards.
with mlflow.start_run():
    model = LogisticRegression(random_state=0)
    # The estimator accepts the sparse TF-IDF matrices directly; converting
    # them to dense arrays only costs memory.
    model.fit(X_train_dtm, y_train)

    predicted_qualities = model.predict(X_test_dtm)

    # rmse/mae/r2 treat the integer class codes as numeric quantities, which
    # they are not; they are kept so new runs stay comparable with runs that
    # are already logged. Accuracy is the metric suited to this classifier.
    (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)
    accuracy = accuracy_score(y_test, predicted_qualities)

    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    if tracking_url_type_store != "file":
        # Register the model
        # There are other ways to use the Model Registry, which depends on the use case,
        # please refer to the doc for more information:
        # https://mlflow.org/docs/latest/model-registry.html#api-workflow
        mlflow.sklearn.log_model(model,'Logistic regression models',registered_model_name='Logistic regression model')
    else:
        # A plain file store has no model registry: log the model only.
        mlflow.sklearn.log_model(model, "model")

        

  RMSE: 0.5954913341754137
  MAE: 0.2553191489361702
  R2: 0.4436015384912735


Registered model 'Logistic regression model' already exists. Creating a new version of this model...
2022/03/14 11:19:59 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Logistic regression model, version 3
Created version '3' of model 'Logistic regression model'.


In [None]:
# Sanity check: the predictions and the held-out labels should have the same shape.
predicted_qualities.shape,y_test.shape


((987,), (987,))