#Model

In [None]:
# Loading the required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.feature_selection import SelectKBest, chi2 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
import nltk
nltk.download('stopwords')
# %matplotlib inline
from sklearn.pipeline import Pipeline

In [0]:
# Loading the dataset formed by scraping Instagram throgh Instaloader
df = pd.read_excel('newdata.xlsx')

In [None]:
# Converting the caption to string type
df['caption'] = df['caption'].astype(str)

In [0]:
df.shape

(14002, 7)

In [0]:
df.columns

Index(['username', 'image_url', 'following', 'followers', 'likes', 'caption',
       'category'],
      dtype='object')

In [0]:
# Dropping the rows with null values
df = df[df.category.isnull() == False]

In [0]:
df.isnull().sum()

username      0
image_url     0
following     0
followers     0
likes         0
caption      16
category      0
dtype: int64

In [0]:
# Descriptive statistics of the dataset
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
following,14002.0,997.275818,1557.930361,0.0,113.0,385.0,1034.0,7652.0
followers,14002.0,5623.038637,132565.263353,0.0,219.0,710.0,2288.0,15501290.0
likes,14002.0,175.3021,2608.578898,0.0,11.0,31.0,93.0,298709.0
category,14002.0,0.357163,0.479181,0.0,0.0,0.0,1.0,1.0


In [0]:
# Value counts of positives(1) and negatives(0)
df['category'].value_counts()

0.0    9001
1.0    5001
Name: category, dtype: int64

In [0]:
# Cleaning the text data by stripping blank spaces, removing unnecessary tokens.
# Text pre-processing : Removing stopwords and stemming

stemmer = PorterStemmer()
words = stopwords.words("english")
df['caption'] = df['caption'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z0-9]", " ", x).split() if i not in words]).lower())

# Using TFIDF Vectorizer to form numerical features
vectorizer = TfidfVectorizer(min_df=3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))

In [0]:
# Selecting top 1000 best features using SelectKBest and storing it in an array
pipe = Pipeline([('vect', vectorizer), ('chi',  SelectKBest(chi2, k=1000))])
text_features = pipe.fit_transform(df['caption'], df['category'])
text_features = text_features.toarray()

In [0]:
text_features.shape

(14002, 1000)

In [0]:
# Categorising the target variable
df['category'] = pd.Categorical(df['category'])

In [0]:
# Creating training and test datasets
X = text_features
y = df['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Fitting the model
# Linear SVC is used since the data is sparsed and Support vectors work well on such kind of dataset
model = LinearSVC(class_weight='balanced', max_iter=5000)
model.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=5000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [0]:
# Performing prediction on test dataset
preds = model.predict(X_test)
print('Final prediction score: [%.8f]' % accuracy_score(y_test, preds))
print('Final prediction f1 score: [%.8f]' % f1_score(y_test, preds, average='weighted'))

Final prediction score: [0.91181721]
Final prediction f1 score: [0.91094302]


In [0]:
# Printing classification report and confusion matrix
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

              precision    recall  f1-score   support

         0.0       0.91      0.95      0.93      1804
         1.0       0.91      0.84      0.87       997

    accuracy                           0.91      2801
   macro avg       0.91      0.89      0.90      2801
weighted avg       0.91      0.91      0.91      2801

[[1720   84]
 [ 163  834]]
