In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
import time
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
nltk.download('stopwords')

In [None]:
df=pd.read_csv('../input/train_file.csv')

In [None]:
test=pd.read_csv('../input/test_file.csv')

In [None]:
idx=pd.read_csv('../input/test_file.csv')
idx=idx.set_index('IDLink')

In [None]:
idx.head()

In [76]:
df['Source'].value_counts()

Bloomberg                        992
Reuters                          763
ABC News                         645
New York Times                   573
The Guardian                     551
                                ... 
World Policy Institute (blog)      1
Enstarz                            1
LBC 97.3                           1
Cities Today                       1
Sojourners                         1
Name: Source, Length: 4753, dtype: int64

In [77]:
df.isnull().sum()

IDLink                 0
Title                  0
Headline               0
Source               175
Topic                  0
PublishDate            0
Facebook               0
GooglePlus             0
LinkedIn               0
SentimentTitle         0
SentimentHeadline      0
dtype: int64

In [78]:
l=df[df['Source'].isnull()==True] 

In [79]:
l['Facebook'].value_counts()

 0     127
-1      34
 1       4
 2       4
 3       2
 13      1
 9       1
 5       1
 4       1
Name: Facebook, dtype: int64

In [80]:
l['GooglePlus'].value_counts()

 0    139
-1     34
 1      1
 8      1
Name: GooglePlus, dtype: int64

In [81]:
l['LinkedIn'].value_counts()

 0    140
-1     34
 1      1
Name: LinkedIn, dtype: int64

##  Here we can see that the news with no sources have negligible impact on the popularity.

In [82]:
bloomberg=df[df['Source']=='Bloomberg'] 

In [83]:
bloomberg['Facebook'].value_counts()

1       117
2        78
0        66
3        64
4        48
       ... 
72        1
139       1
75        1
137       1
1982      1
Name: Facebook, Length: 170, dtype: int64

In [84]:
bloomberg['GooglePlus'].value_counts()

 0      540
 1      161
 2       63
-1       44
 3       31
 4       28
 5       25
 7       13
 8       11
 6       10
 9        7
 12       6
 14       6
 11       4
 15       3
 22       3
 21       3
 13       2
 10       2
 16       2
 18       2
 19       2
 33       2
 23       2
 121      1
 37       1
 109      1
 88       1
 55       1
 54       1
 47       1
 46       1
 43       1
 39       1
 36       1
 30       1
 29       1
 28       1
 26       1
 25       1
 17       1
 24       1
 57       1
 74       1
Name: GooglePlus, dtype: int64

In [85]:
bloomberg['LinkedIn'].value_counts()

 0      247
 1      102
 2       57
-1       44
 4       37
       ... 
 133      1
 129      1
 72       1
 73       1
 91       1
Name: LinkedIn, Length: 157, dtype: int64

# Popular Sources have thier corresponding effect on the popularity scores so there is no need of keeping source in the model, it will only increase the popularity.

In [116]:
fb_max_viewers=df['Facebook'].sort_values().tail(10).values.tolist()
gp_max_viewers=df['GooglePlus'].sort_values().tail(10).values.tolist()
li_max_viewers=df['LinkedIn'].sort_values().tail(10).values.tolist()

In [117]:
Best_News_Sources=[]
Best_Topics=[]
for i in fb_max_viewers:
    Best_News_Sources.append(df[df['Facebook']==i]['Source'].values)
    Best_Topics.append(df[df['Facebook']==i]['Topic'].values)
for i in gp_max_viewers:
    Best_News_Sources.append(df[df['GooglePlus']==i]['Source'].values)
    Best_Topics.append(df[df['GooglePlus']==i]['Topic'].values)
for i in li_max_viewers:
    Best_News_Sources.append(df[df['LinkedIn']==i]['Source'].values)
    Best_Topics.append(df[df['LinkedIn']==i]['Topic'].values)

In [124]:
Best_News_Sources

[array(['GameZone'], dtype=object),
 array(['GameZone'], dtype=object),
 array(['Telegraph.co.uk'], dtype=object),
 array(['Breitbart News'], dtype=object),
 array(['CNN'], dtype=object),
 array(['New York Times'], dtype=object),
 array(['CNN'], dtype=object),
 array(['New York Times'], dtype=object),
 array(['Breitbart News'], dtype=object),
 array(['New Zealand Herald'], dtype=object),
 array(['The Guardian'], dtype=object),
 array(['The Intercept'], dtype=object),
 array(['The Verge'], dtype=object),
 array(['Breitbart News'], dtype=object),
 array(['Raw Story'], dtype=object),
 array(['New York Times'], dtype=object),
 array(['Narendra Modi (press release) (blog)'], dtype=object),
 array(['Telegraph.co.uk'], dtype=object),
 array(['The Verge'], dtype=object),
 array(['CNNMoney'], dtype=object),
 array(['Mashable'], dtype=object),
 array(['New York Times'], dtype=object),
 array(['Winnipeg Free Press'], dtype=object),
 array(['Harvard Business Review'], dtype=object),
 array(['Harva

## Best News Sources ^

In [118]:
Best_Topics

[array(['microsoft'], dtype=object),
 array(['microsoft'], dtype=object),
 array(['microsoft'], dtype=object),
 array(['obama'], dtype=object),
 array(['obama'], dtype=object),
 array(['economy'], dtype=object),
 array(['obama'], dtype=object),
 array(['obama'], dtype=object),
 array(['obama'], dtype=object),
 array(['economy'], dtype=object),
 array(['economy'], dtype=object),
 array(['microsoft'], dtype=object),
 array(['microsoft'], dtype=object),
 array(['obama'], dtype=object),
 array(['obama'], dtype=object),
 array(['economy'], dtype=object),
 array(['economy'], dtype=object),
 array(['microsoft'], dtype=object),
 array(['microsoft'], dtype=object),
 array(['economy'], dtype=object),
 array(['microsoft'], dtype=object),
 array(['obama'], dtype=object),
 array(['economy'], dtype=object),
 array(['economy'], dtype=object),
 array(['economy'], dtype=object),
 array(['microsoft'], dtype=object),
 array(['economy'], dtype=object),
 array(['economy'], dtype=object),
 array(['microsoft

# Popular Topics ^

In [None]:
def preprocess(df1):
    start_time = time.clock()
    df1.pop('PublishDate')
    df1.pop("IDLink")
    df1.pop('Topic')
    df1.pop('Source')
    print('Dropped PublishDate and IDLink')
    
    #df1['Source']=df1['Source'].fillna('unknown')
    #print('Nans for Source Treated!')
    Title=[]
    Headline=[]
   # Source=[]
  
    for i in df1['Title']:
        letters_only = re.sub("[^a-zA-Z]"," ",i)
        Title.append(letters_only)

    for i in df1['Headline']:
        letters_only = re.sub("[^a-zA-Z]"," ",i)
        Headline.append(letters_only)
        
    #for i in df1['Source']:
        #letters_only = re.sub("[^a-zA-Z]"," ",i)
        #Source.append(letters_only)
      
      
    df1['Title']=Title    
    df1['Headline']=Headline
    #df1['Source']=Source
    print('Title and Headline corpus with only Letters')
   

    df1['Title'] = df1['Title'].str.lower()
    df1['Headline'] = df1['Headline'].str.lower()
    #df1['Source'] = df1['Source'].str.lower()
    print('Title and Headline converted to lower case')

    df1['Title'] = df1['Title'].str.strip()
    df1['Headline'] = df1['Headline'].str.strip()
    #df1['Source'] = df1['Source'].str.strip()
    print('Title and Headline Stripped for blank spaces')
    
    #for i,l in enumerate(df1['Source']):
        #k=df1['Source'][i].replace(" ", "")
        #df1['Source'][i]=k
    #print('Source Stripped for blank spaces inside')
    
    df1['Title'] = df1['Title'].str.split()  
    df1['Headline'] = df1['Headline'].str.split() 
  
    stop = stopwords.words("english")
  
    def removestopwords(y):   
        stopwordremoved =[w for w in y if not w in stop]
        return(" ".join(stopwordremoved))
    column_size = df1.shape[0]
  
    cleaned1 = []
    cleaned2 = []
    for i in range( 0, column_size):
        cleaned1.append(removestopwords(df1['Title'][i]))
        cleaned2.append(removestopwords(df1['Headline'][i]))
  
    df1['Title'] = cleaned1
    df1['Headline'] = cleaned2
    print('Title and Headline :Removed Stopwords')

    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()

    def lemmatize_text(text):
        lemm = [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]
        return(" ".join(lemm)) 

    clean1=[]
    clean2=[]
    for i in range( 0, column_size):
        clean1.append(lemmatize_text(df1['Title'][i]))
        clean2.append(lemmatize_text(df1['Headline'][i]))
  
    df1['Title']=clean1
    df1['Headline']=clean2
    print('Title and Headline :Lemmatized')
  
    porter_stemmer = PorterStemmer()
    def stemmer(text):
        stem = [porter_stemmer.stem(w) for w in w_tokenizer.tokenize(text)]
        return(" ".join(stem)) 
    stem1=[]
    stem2=[]
    for i in range( 0, column_size):
        stem1.append(stemmer(df1['Title'][i]))
        stem2.append(stemmer(df1['Headline'][i]))
    df1['Title']=stem1
    df1['Headline']=stem2
    print('Title and Headline :Stemmed')

    #df1=pd.concat((df1.drop(['Topic'],axis=1 ), pd.get_dummies(df1['Topic'])) , axis=1)
    #print('One-hot Encoding for Topic')
    print('Time took for preprocessing',time.clock() - start_time, "seconds")
    return df1

In [None]:
train=preprocess(df)

In [None]:
test=preprocess(test)

In [None]:
#import dateutil.parser as dparser
#dparser.parse(df['PublishDate'][55927],fuzzy=True)

# Model1

In [None]:
data1=train
data2=test

In [None]:
data1.head(3)

In [None]:
data2.head(3)

In [None]:
#for i,l in enumerate(data1['Source']):
 # k=data1['Source'][i].replace(" ", "")
 # data1['Source'][i]=k

In [None]:
start_time = time.clock()
data1=pd.concat([data1,pd.DataFrame(Normalizer().fit_transform(data1.iloc[:,2:5]), columns=['Fb', 'Gp', 'LI'])], axis=1)
data1=data1.drop(['Facebook', 'GooglePlus','LinkedIn' ], axis=1)
data2=pd.concat([data2,pd.DataFrame(Normalizer().fit_transform(data2.iloc[:,2:5]), columns=['Fb', 'Gp', 'LI'])], axis=1)
data2=data2.drop(['Facebook', 'GooglePlus','LinkedIn' ], axis=1)
print('Time took for Normalizing',time.clock() - start_time, "seconds")

In [None]:
data1.head(3)

In [None]:
data2.head(3)

In [None]:
vectorizer = TfidfVectorizer(max_features=5000)

In [None]:
X_train=pd.concat([pd.DataFrame(vectorizer.fit_transform(data1['Title']).toarray()) , pd.DataFrame(vectorizer.fit_transform(data1['Headline']).toarray())], axis=1)

In [None]:
X_test=pd.concat([pd.DataFrame(vectorizer.fit_transform(data2['Title']).toarray())  , pd.DataFrame(vectorizer.fit_transform(data2['Headline']).toarray())], axis=1)

In [None]:
data1=data1.drop(['Title', 'Headline'],axis=1)
data2=data2.drop(['Title', 'Headline'],axis=1)

In [None]:
X_train=pd.concat([X_train, data1.drop(['SentimentTitle','SentimentHeadline'], axis=1)], axis=1)

In [None]:
X_test=pd.concat([X_test, data2], axis=1)

In [None]:
X_train.head(3)

In [None]:
X_test.head(3)

In [None]:
y_train=data1.loc[:,'SentimentTitle':'SentimentHeadline']

In [None]:
y_train.head(3)

In [None]:
#This was done evaluating the model on training before moving to validation.

#X_train, X_test, Y_train, Y_test = train_test_split(e,y, test_size = 0.30, random_state = 143)
#X_train.shape

## Best_Model : MLP Regressor

In [None]:
neu=MLPRegressor(hidden_layer_sizes=(100, ), 
             activation='relu', solver='adam', 
             alpha=0.0001, batch_size=32, 
             learning_rate='adaptive', 
             learning_rate_init=0.001, 
             power_t=0.5, max_iter=200, 
             shuffle=True, random_state=None, 
             tol=0.0001, verbose=False, 
             warm_start=False, momentum=0.9, 
             nesterovs_momentum=True, early_stopping=False,
             validation_fraction=0.1, beta_1=0.9,
             beta_2=0.999, epsilon=1e-08,
             n_iter_no_change=10, max_fun=15000)

In [None]:
neu.fit(X_train, y_train)

In [None]:
neu.score(X_train, y_train)

In [None]:
pred=neu.predict(X_test)
MLP500=pd.DataFrame(pred, columns=['SentimentTitle', 'SentimentHeadline'], index=idx.index)
MLP500.to_csv('/kaggle/working/MLP500.csv')

## This Showed an Accuracy of 88.022

##  -----------------------------------------------------------------------------------

## Model 2: MultiOutputregressor with embedded RandomForest

In [None]:
mo= MultiOutputRegressor(RandomForestRegressor(n_jobs=-1), n_jobs=-1)
mo.fit(X_train,y_train)

In [None]:
mo.score(X_train, y_train)
pred1=mo.predict(X_test)
Ans2=pd.DataFrame(pred1, columns=['SentimentTitle', 'SentimentHeadline'], index=idx.index)
Ans2.to_csv('/kaggle/working/Ans2.csv')

## This Showed an Accuracy of 88.7

## ------------------------------------------------------------------------------------

## Model 3 Neural Networks.

In [None]:
!pip install -U tensorflow==2.0.0 --quiet
import tensorflow as tf

In [None]:
model=tf.keras.Sequential()
model.add(tf.keras.layers.Dense(128,input_dim = X_train.shape[1], activation='relu'))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(128,activation='relu'))
model.add(tf.keras.layers.Dense(128,activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(128,activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(128,activation='relu'))
model.add(tf.keras.layers.Dense(2,activation='linear'))
model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
model.summary()

In [None]:
from keras.callbacks import ModelCheckpoint
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]]

In [None]:
model.fit(X_train,y_train, epochs=1, batch_size=32, validation_split = 0.30, callbacks=callbacks_list)

In [None]:
pred=model.predict(X_test)
idx=idx.set_index('IDLink')
ArpitNeural4=pd.DataFrame(pred, columns=['SentimentTitle', 'SentimentHeadline'], index=idx.index)
ArpitNeural4.to_csv('/kaggle/working/ArpitNeural4.csv')

# With this i got an accuracy of 88.26

I used various Neural models with multiple layers, trying to find the best accuracy.
Topic and Source were only. increasing the complexity of the models so dropped them.

## I used multiple combinations of Features with various models, of all of them some I have shown which are the best.  Rest was everything when i was playing with the data.