In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('NEW_DATA/MERGED_DATA.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,created_utc,flair,flair_colour,flair_text_colour,num_comments,score,text
0,0,1580498000.0,| Repost |,,dark,1,0,"Economic Survey sources data from Wikipedia, o..."
1,1,1580499000.0,| Low-effort Self Post |,,dark,0,1,"How do you explain ""vaali"" ?[removed]"
2,2,1580467000.0,Politics,#ddbd37,dark,12,140,"Despite Jamia Shooting, Amit Shah's Divisive R..."
3,3,1580480000.0,CAA-NRC,,dark,16,8,Youth of India: Voices of Reason or Pawns[remo...
4,4,1580486000.0,Non-Political,#5093d6,dark,6,13,"Indian Student Variety AbroadSo, I am kinda fe..."


In [4]:
data.columns

Index(['Unnamed: 0', 'created_utc', 'flair', 'flair_colour',
       'flair_text_colour', 'num_comments', 'score', 'text'],
      dtype='object')

In [5]:
# 'Unnamed: 0' appears to be a row index saved into the CSV (its values mirror the
# index in the preview above), so it is dropped before modelling.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
data.shape

(55863, 7)

The data contains a few exact duplicate rows. Let's drop them.

In [7]:
# Remove rows that are exact duplicates across all columns (first occurrence is kept).
# The index is not reset here, so row labels keep gaps where duplicates were removed.
data = data.drop_duplicates()

In [8]:
data.shape

(55856, 7)

In [9]:
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords 
# Punctuation that separates tokens; replaced by a space so neighbouring words do not fuse.
# Raw string: '\[', '\]' and '\|' are invalid escapes in a normal string literal.
PLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase ASCII letters, space, '#', '+' and '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))


def clean_text(text):
    """
        Normalise one post's text before vectorisation.

        Steps, in order: lowercase, replace PLACE_BY_SPACE_RE characters with a
        space, delete characters matched by BAD_SYMBOLS_RE, drop English stopwords.
        HTML is not decoded.

        text: a string; None or NaN (a missing cell) is treated as an empty document

        return: the cleaned string, tokens joined by single spaces
    """
    # A missing cell arrives as None or float NaN; NaN is the only float not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = text.lower()  # lowercase text
    text = PLACE_BY_SPACE_RE.sub(' ', text)  # replace PLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text)  # delete symbols matched by BAD_SYMBOLS_RE from text
    # Stopwords are filtered after symbol removal, on whitespace-separated tokens.
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)  # delete stopwords from text
    return text
    
data['text'] = data['text'].apply(clean_text)



In [10]:
data.shape

(55856, 7)

In [11]:
data.head()

Unnamed: 0,created_utc,flair,flair_colour,flair_text_colour,num_comments,score,text
0,1580498000.0,| Repost |,,dark,1,0,economic survey sources data wikipedia private...
1,1580499000.0,| Low-effort Self Post |,,dark,0,1,explain vaali removed
2,1580467000.0,Politics,#ddbd37,dark,12,140,despite jamia shooting amit shahs divisive rhe...
3,1580480000.0,CAA-NRC,,dark,16,8,youth india voices reason pawns removed
4,1580486000.0,Non-Political,#5093d6,dark,6,13,indian student variety abroadso kinda feeling ...


In [12]:
# Total token count over all cleaned posts: split each text on single spaces,
# take the number of pieces per row, then add the rows up.
data['text'].str.split(' ').str.len().sum()

971169

In [72]:
rec = data['flair'].value_counts()

In [76]:
rec

9529

In [90]:
# Hand-written list of flair labels.
# NOTE(review): no visible cell reads this list, and 'others' here differs in case from
# the 'Others' value used to fill missing flairs — confirm which spelling is intended.
flair = ['Coronavirus', 'Politics', 'Non-Political', 'AskIndia', 'Policy/Economy', 'Scheduled', 'Photography', 'Business/Finance', 'others', 'Unverified', 'Science/Technology', 'Food', 'CAA-NRC-NPR', 'Megathread', 'Meta.' ]

In [18]:
# Posts without a flair get the catch-all label 'Others'.
# Assign the result back instead of calling fillna(inplace=True) on data['flair']:
# the inplace form acts on a column selection, which pandas warns about and which
# leaves `data` unchanged under Copy-on-Write.
data['flair'] = data['flair'].fillna('Others')

In [15]:
# Missing flair colours get the placeholder 'Others', so the column can be
# concatenated into the text feature without producing NaN.
# Assigned back rather than fillna(inplace=True) on a column selection, which
# pandas warns about and which leaves `data` unchanged under Copy-on-Write.
data['flair_colour'] = data['flair_colour'].fillna('Others')

In [88]:
data.isnull().sum()

created_utc          0
flair                0
flair_colour         0
flair_text_colour    0
num_comments         0
score                0
text                 0
dtype: int64

In [68]:
from sklearn.model_selection import train_test_split
# Features: cleaned text plus the two flair colour columns; target: the flair label.
X = data.loc[:, ['text', 'flair_colour', 'flair_text_colour']]
y = data['flair']
# 60/40 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [92]:
X

Unnamed: 0,text,flair_colour,flair_text_colour
0,economic survey sources data wikipedia private...,Others,dark
1,explain vaali removed,Others,dark
2,despite jamia shooting amit shahs divisive rhe...,#ddbd37,dark
3,youth india voices reason pawns removed,Others,dark
4,indian student variety abroadso kinda feeling ...,#5093d6,dark
...,...,...,...
55858,old ask boy surrender army man asks family kas...,#5093d6,dark
55859,youre thoughts ayodhya verdict removed,Others,dark
55860,spitfire x gangs wasseypur,#5093d6,dark
55861,ayodhya verdict historic supreme court verdict...,#ddbd37,dark


In [94]:
# Collapse the three feature columns into one space-separated string per post
# (cleaned text, flair colour, flair text colour). This is built from all of X,
# not only X_train, and is what the pipeline below is fitted on.
train = X['text'] +" "+ X['flair_colour'] +" "+ X['flair_text_colour']
# X_test = X_test['text'] + " " + X_test['flair_colour']+" "+ X_test['flair_text_colour']

In [70]:
X_train[1001]

'first go single indian muslim deported says shahnawaz hussain #ddbd37 dark'

In [96]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score,classification_report

# Text pipeline: token counts -> TF-IDF weighting -> linear classifier trained
# with SGD on hinge loss. Five passes over the data, no early stopping (tol=None).
sgd = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        (
            'clf',
            SGDClassifier(
                loss='hinge',
                penalty='l2',
                alpha=1e-3,
                random_state=42,
                max_iter=5,
                tol=None,
            ),
        ),
    ]
)
# Fitted on the combined feature strings for every row, with the flair as the label.
sgd.fit(train, y)

# %%time

# y_pred = sgd.predict(X_test)

# print('accuracy %s' % accuracy_score(y_pred, y_test))
# print(classification_report(y_test, y_pred))

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('clf',
                 SGDClassifier(alpha=0.001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                      

In [79]:
import pickle

In [81]:
saved_model = pickle.dumps(sgd)

In [85]:
import joblib 

In [97]:
joblib.dump(sgd, 'MODEL_1_DEMO.pkl') 

['MODEL_1_DEMO.pkl']

In [84]:
ls

DataAccumulation_Pre_CleaningData.ipynb
ExperimentDetails.ipynb
FeatureCleaningAndModels.ipynb
MODEL_1_DEMO.pkl
[34mNEW_DATA[m[m/
[34mOLD_DATA[m[m/
ScrapperPraw.ipynb
ScrapperScript.ipynb
Untitled1.ipynb
reddit-data.json
scrapper-parallel-1.ipynb
scrapper-parallel-2.ipynb
scrapper-parallel-3.ipynb
srapper-parallel-4.ipynb


In [86]:
# Reload the persisted pipeline and predict on the held-out rows.
model = joblib.load('MODEL_1_DEMO.pkl')
# The pipeline was fitted on one string per post ("text flair_colour flair_text_colour"),
# whereas train_test_split leaves X_test as a three-column DataFrame. Passing the
# DataFrame directly would make the vectoriser iterate over column labels, not rows,
# so collapse it the same way the training input was built.
if isinstance(X_test, pd.DataFrame):
    X_test_text = X_test['text'] + " " + X_test['flair_colour'] + " " + X_test['flair_text_colour']
else:
    X_test_text = X_test  # already one string per post
# NOTE(review): the pipeline saved to MODEL_1_DEMO.pkl is fitted on `train`, which is
# built from all of X, so these rows were seen during fitting when cells run in order.
# Treat any score computed from y_pred as optimistic, not as a held-out estimate.
y_pred = model.predict(X_test_text)

In [87]:
# accuracy_score expects (y_true, y_pred); the value is the same either way round,
# but the documented order matches the other sklearn metric calls.
print('accuracy %s' % accuracy_score(y_test, y_pred))

accuracy 0.7423801638096943
