In [3]:
import pandas as pd
import preprocessor as p
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score
import mlflow
import mlflow.sklearn
import nltk
import ssl
import string
import re
import matplotlib.pyplot as plt

# Replace the default HTTPS context factory with the unverified one, when this
# Python build exposes it, before nltk.download() runs below.
# NOTE(review): this turns off certificate verification for HTTPS requests
# that go through ssl._create_default_https_context for the rest of the
# process, not only for the NLTK download -- confirm this is acceptable.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This build has no ssl._create_unverified_context; keep the default.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
# Fetch the stopword corpus that clean_text() reads via nltk.corpus.stopwords.
nltk.download('stopwords')

# Choose which token categories p.clean() acts on (URLs, emojis, reserved
# words, mentions, smileys) -- presumably the tweet-preprocessor API; confirm.
# NOTE(review): the recorded run of this cell stopped on this line with
# "AttributeError: module 'preprocessor' has no attribute 'set_options'",
# which suggests the installed `preprocessor` module is not the one this code
# expects -- verify which distribution provides it.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.RESERVED, p.OPT.MENTION, p.OPT.SMILEY)


# Lazily populated by _english_stopwords(); None until the first call.
_STOPWORD_CACHE = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        _STOPWORD_CACHE = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOPWORD_CACHE


def clean_text(text):
    """Normalise one raw text sample into a space-separated string of tokens.

    Steps, in order:
      1. ``p.clean`` strips the token categories configured via
         ``p.set_options`` at module level.
      2. The text is lower-cased and every character that is not ``a``-``z``
         or whitespace is removed.
      3. The result is split on whitespace and English stopwords are dropped.

    Args:
        text: The raw text to clean.

    Returns:
        The remaining tokens joined by single spaces (possibly an empty
        string).
    """
    text = p.clean(text)
    # Raw string so that "\s" reaches the regex engine instead of being
    # treated as an (invalid) Python string escape.
    text = re.sub(r'[^a-z\s]', '', text.lower())
    # After the substitution every token consists solely of a-z, so filtering
    # out punctuation tokens as well would never match anything.
    stopwords = _english_stopwords()
    return ' '.join(w for w in text.split() if w not in stopwords)

def load_data(path="flair_labeled_sentiments_long.csv", test_size=0.2, random_state=42):
    """Load the labelled sentiment CSV and return a train/test split.

    The CSV is read without a header row as two columns, ``feedback`` and
    ``text``. Labels ``__label__Negative`` / ``__label__Positive`` are encoded
    as ``0`` / ``1``, the text is passed through ``clean_text``, and rows
    whose cleaned text duplicates an earlier row are dropped.

    Args:
        path: Location of the CSV file.
        test_size: Fraction of rows held out for testing.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` of pandas Series.

    Raises:
        ValueError: If the ``feedback`` column holds a value other than the
            two known label strings.
    """
    label_codes = {'__label__Negative': 0, '__label__Positive': 1}

    df = pd.read_csv(path, names=['feedback', 'text'])

    # Map explicitly and cast to int so the target has a numeric dtype
    # regardless of how pandas handles object-column downcasting, and so an
    # unexpected label is reported here rather than surfacing during fitting.
    codes = df['feedback'].map(label_codes)
    unknown = codes.isna()
    if unknown.any():
        bad_values = sorted(set(df.loc[unknown, 'feedback'].astype(str)))
        raise ValueError("Unexpected feedback label(s): %s" % ", ".join(bad_values))
    df['feedback'] = codes.astype(int)

    df['text'] = df['text'].apply(clean_text)
    df = df.drop_duplicates(subset=['text'], keep='first')

    X_train, X_test, y_train, y_test = train_test_split(
        df['text'], df['feedback'], test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test


def eval_metrics(actual, pred):
    """Score predictions against the true labels.

    Args:
        actual: The true labels.
        pred: The predicted labels.

    Returns:
        A 3-tuple ``(confusion matrix, classification report, accuracy)`` as
        produced by ``confusion_matrix``, ``classification_report`` and
        ``accuracy_score`` respectively.
    """
    return (
        confusion_matrix(actual, pred),
        classification_report(actual, pred),
        accuracy_score(actual, pred),
    )

def plot_auc_roc_curve(actual, pred, path="RoC.png"):
    """Plot a ROC curve, save it to disk, display it and return the figure.

    The plot shows the curve computed from ``actual``/``pred`` (labelled with
    its AUC) together with a diagonal "random" line and a "perfect" reference
    line.

    Args:
        actual: The true binary labels.
        pred: The scores or predictions handed to ``roc_curve`` and
            ``roc_auc_score``.
        path: File the figure is written to.

    Returns:
        The matplotlib figure. It has already been closed via ``plt.close``
        by the time it is returned.
    """
    # Compute the metrics first so that no figure is left open if they raise.
    fpr, tpr, _thresholds = roc_curve(actual, pred)
    auc_score = roc_auc_score(actual, pred)

    # A fresh figure/axes pair avoids drawing on whatever a fixed figure
    # number might already contain.
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, 'r-', label='RF AUC: %.3f' % auc_score)
    ax.plot([0, 1], [0, 1], 'k-', label='random')
    ax.plot([0, 0, 1, 1], [0, 1, 1, 1], 'g-', label='perfect')
    ax.legend()
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')

    # Write the file before showing, so the artifact exists even when show()
    # blocks or the backend tears the figure down afterwards.
    fig.savefig(path)
    plt.show()
    plt.close(fig)

    return fig

def create_model():
    """Train, evaluate and log the sentiment classifier inside one MLflow run.

    Loads the train/test split via ``load_data``, fits a
    CountVectorizer -> TfidfTransformer -> RandomForestClassifier pipeline,
    prints the confusion matrix, classification report and accuracy, logs the
    accuracy and the model to MLflow, saves a copy of the model under
    ``sklearn_model/model-<accuracy>``, and logs the ROC plot ``RoC.png`` as
    an artifact.
    """
    X_train, X_test, y_train, y_test = load_data()
    with mlflow.start_run():
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', RandomForestClassifier(n_estimators=1000, random_state=0)),
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        matrix, report, accuracy = eval_metrics(y_test, y_pred)
        print("Confusion Matrix : " + str(matrix))
        print("Classification Report : " + str(report))
        print("Accuracy : " + str(accuracy))

        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(pipeline, "model")

        # NOTE(review): the path depends only on the accuracy, and both seeds
        # are fixed, so a re-run will likely target the same directory;
        # save_model may refuse to write into an existing one -- confirm.
        modelpath = "sklearn_model/model-%f" % (accuracy)
        mlflow.sklearn.save_model(pipeline, modelpath)

        # A ROC curve needs scores, not thresholded labels: use the predicted
        # probability of the positive class. Classes are sorted, so with the
        # 0/1 labels from load_data column 1 belongs to label 1.
        y_score = pipeline.predict_proba(X_test)[:, 1]
        plot_auc_roc_curve(y_test, y_score)

        # The plot is written to "RoC.png"; the name has no format
        # placeholder, so it must not be %-formatted.
        mlflow.log_artifact("RoC.png")

# Run training only when executed directly (script or notebook cell), so that
# importing this module does not start a training run as a side effect.
if __name__ == "__main__":
    create_model()


  text = re.sub('[^a-z\s]', '', text.lower())                  # get rid of noise
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


AttributeError: module 'preprocessor' has no attribute 'set_options'