In [1]:
"""Read the analysed data into a dataframe"""
import pandas as pd

df = pd.read_csv(r'C:\Users\rajesh kumar\Documents\webScraper\analysedData.csv',encoding = 'utf-8')
df.shape

(811, 8)

In [2]:
all_flairs=df.flair.unique()

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt

In [4]:
"""
Spiltting the data
Train:test=70:30
Training data is the words from both title and description
Testing data is the flair of the posts
"""
X = df.both
y = df.flair
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [5]:
"""Performing Naive Bayes algorithm"""
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [6]:
%%time
from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=all_flairs))

accuracy 0.6024590163934426
                precision    recall  f1-score   support

   Coronavirus       0.55      0.88      0.67        48
      Politics       0.88      0.52      0.65        54
 Non-Political       0.64      0.26      0.37        53
Policy/Economy       1.00      0.05      0.09        21
      AskIndia       0.46      0.89      0.60        46
     Scheduled       0.95      0.95      0.95        22

      accuracy                           0.60       244
     macro avg       0.74      0.59      0.56       244
  weighted avg       0.70      0.60      0.56       244

Wall time: 208 ms


In [7]:
"""Performing SGDClassifier"""
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('clf',
                 SGDClassifier(alpha=0.001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                      

In [8]:
%%time

y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=all_flairs))

accuracy 0.7213114754098361
                precision    recall  f1-score   support

   Coronavirus       0.67      0.69      0.68        48
      Politics       0.77      0.89      0.83        54
 Non-Political       0.67      0.49      0.57        53
Policy/Economy       0.76      0.62      0.68        21
      AskIndia       0.64      0.76      0.69        46
     Scheduled       0.95      0.95      0.95        22

      accuracy                           0.72       244
     macro avg       0.74      0.73      0.73       244
  weighted avg       0.72      0.72      0.72       244

Wall time: 40.1 ms


In [9]:
"""Performing Logistic Regression"""
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=100000.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=

In [10]:
%%time

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=all_flairs))

accuracy 0.680327868852459
                precision    recall  f1-score   support

   Coronavirus       0.60      0.60      0.60        48
      Politics       0.78      0.80      0.79        54
 Non-Political       0.58      0.53      0.55        53
Policy/Economy       0.73      0.52      0.61        21
      AskIndia       0.61      0.74      0.67        46
     Scheduled       0.95      0.95      0.95        22

      accuracy                           0.68       244
     macro avg       0.71      0.69      0.70       244
  weighted avg       0.68      0.68      0.68       244

Wall time: 40.1 ms


In [13]:
"""
Logistic Regression performs best
Doing a random test"""
import praw
from processText import perform
def detector(link):
    r=praw.Reddit(client_id='CfAnZNaCtwGKtQ', client_secret='CdMiYf44PL1FrmfK8j0oZgVH-J4', user_agent='srk_test_0110')
    submission=r.submission(url=input_link)
    title=submission.title
    description=submission.selftext
    both=title+description
    logreg = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', LogisticRegression(n_jobs=1, C=1e5)),])
    logreg.fit(X_train, y_train)
    model=logreg
    new_input = [perform(both)]
    # get prediction for new input
    new_output = model.predict(new_input)
    # summarize input and output
    return new_input, new_output

In [14]:
input_link="https://www.reddit.com/r/india/comments/g7bk1p/while_a_so_called_journalist_cries_about_an/"
detector(input_link)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(['call journalist cry attack never forget real attack journalist'],
 array(['Politics'], dtype=object))