# ML Model to predict the AWS Product

### Import the necessary libraries

In [None]:
import pandas as pd
import csv

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Show up to 150 characters per cell so long descriptions stay readable.
pd.set_option('display.max_colwidth', 150)

### (A) Read the CSV file

In [None]:
# Load the cleaned text dataset (the path below is specific to one machine).
df = pd.read_csv('C:\\Users\\Aruna\\Documents\\ACMS-IID\\input\\CleanText.csv')

# Collapse every run of whitespace in a description to a single space and
# trim both ends; non-string cells are stringified first.
df['description'] = df['description'].apply(
    lambda text: " ".join(str(text).split())
)

# Preview the first ten rows.
df.head(10)

#### Check the spread of tags in the data

In [None]:
# Print a summary of the frame: columns, non-null counts, dtypes, memory use.
df.info()

In [None]:
# Summary statistics for the target column (fields shown depend on its dtype).
df['label'].describe()

In [None]:
# Bar chart of the number of rows per label, most frequent label first.
plt.figure(figsize=(10, 4))
df['label'].value_counts().plot.bar();

#### Check out one sample post

In [None]:
# Look at the description stored under index label 100.
df.loc[100, 'description']

#### Top 30 words + frequency of each:

In [None]:
# Join all descriptions into one string, split it on whitespace, and keep the
# 30 most frequent tokens together with their counts.
freq = (
    pd.Series(' '.join(df['description']).split())
    .value_counts()
    .head(30)
)
freq

In [None]:
# Count whitespace-separated tokens across all descriptions. Splitting on
# whitespace (rather than on a literal ' ') means an empty description adds
# 0 words instead of 1, matching the tokenisation used for the top-30 list.
total_words = df['description'].str.split().str.len().sum()
print("There are totally", total_words, "words.")

### (C) Feature Engineering & Model Training

In [None]:
from sklearn.model_selection import train_test_split

# Features are the free-text descriptions; the target is the label column.
X, y = df['description'], df['label']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

## Naive Bayes Classifier for Multinomial Models

In [None]:
# TF-IDF features feeding a multinomial Naive Bayes classifier, both with
# library-default settings.
nb = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Fit the vectorizer and the classifier on the training split.
nb.fit(X_train, y_train)

In [None]:
# Score the Naive Bayes pipeline on the held-out split.
predictions = nb.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); all three calls use that order.
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print('accuracy:', accuracy_score(y_test, predictions))

## Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

# TF-IDF features (vocabulary capped at 10,000 terms) feeding a logistic
# regression classifier.
# NOTE(review): min_df=0.2 discards every term that occurs in fewer than 20%
# of the training documents, which can leave a very small vocabulary -
# confirm this threshold is intended.
logreg = Pipeline(steps=[
    ('vect', TfidfVectorizer(max_features=10000, min_df=0.2)),
    ('clf', LogisticRegression(n_jobs=1)),
])

# Fit the vectorizer and the classifier on the training split.
logreg.fit(X_train, y_train)

In [None]:
# Score the logistic regression pipeline on the held-out split.
predictions = logreg.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); all three calls use that order.
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print('accuracy:', accuracy_score(y_test, predictions))

## Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# TF-IDF features feeding a random forest classifier.
# The forest is seeded (same value as the train/test split) so that repeated
# runs of the notebook report the same metrics; an unseeded forest gives
# different trees, and therefore different scores, on every run.
rf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Fit the vectorizer and the classifier on the training split.
rf.fit(X_train, y_train)

In [None]:
# Score the random forest pipeline on the held-out split.
predictions = rf.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); all three calls use that order.
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print('accuracy:', accuracy_score(y_test, predictions))

## Linear SVC Model

In [None]:
from sklearn.svm import LinearSVC

# TF-IDF features (vocabulary capped at 10,000 terms) feeding a linear
# support vector classifier with default settings.
svm = Pipeline(steps=[
    ('vect', TfidfVectorizer(max_features=10000)),
    ('clf', LinearSVC()),
])

# Fit the vectorizer and the classifier on the training split.
svm.fit(X_train, y_train)

In [None]:
# Score the LinearSVC pipeline on the held-out split.
predictions = svm.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); all three calls use that order.
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print('accuracy:', accuracy_score(y_test, predictions))

## XGBoost Model

In [None]:
from xgboost import XGBClassifier

# TF-IDF features feeding a gradient-boosted tree classifier with default
# settings.
# NOTE(review): recent xgboost releases appear to require class labels encoded
# as integers 0..n_classes-1; if `label` holds strings, encode it first -
# confirm against the installed xgboost version.
xgb = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('clf', XGBClassifier()),
])

# Fit the vectorizer and the classifier on the training split.
xgb.fit(X_train, y_train)

In [None]:
# Score the XGBoost pipeline on the held-out split.
predictions = xgb.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); all three calls use that order.
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print('accuracy:', accuracy_score(y_test, predictions))