In [6]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

## Twitter Data Now

In [3]:
# Load the Twitter data set from the project's Data directory into a DataFrame.
# NOTE(review): the filename 'twittesr.csv' looks like a possible typo - confirm it
# matches the file on disk.
twitter = pd.read_csv(filepath_or_buffer='../Data/twittesr.csv')

In [4]:
# Remove tweets that have no text, since the 'text' column is fed to the
# vectorizers in the cells below.
# dropna(inplace=True) on twitter['text'] operates on a selected column rather than
# on the frame itself, so the NaN rows can silently remain in `twitter` (and always
# do under pandas Copy-on-Write). Filtering the frame removes them reliably.
# The original index labels are kept (no reset_index).
twitter = twitter.dropna(subset=['text'])

### Count Vectorizing Twitter

In [9]:
# Load the pickled vectorizer used for the count-vectorized features.
# The context manager closes the file handle, which the bare open() call leaked.
# NOTE: pickle.load can execute arbitrary code - only unpickle files from a trusted source.
with open('../Assets/cvec.pkl', 'rb') as cvec_file:
    cvec = pickle.load(cvec_file)

In [10]:
# Vectorize the tweet text with the loaded vectorizer (transform only, no refit).
twitter_cvec = cvec.transform(twitter['text'])

# get_feature_names() was removed from scikit-learn in favor of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(cvec, 'get_feature_names_out'):
    cvec_columns = cvec.get_feature_names_out()
else:
    cvec_columns = cvec.get_feature_names()

# Convert the transformed matrix to a dense DataFrame with one column per feature name.
twitter_cvec_df = pd.DataFrame(twitter_cvec.toarray(), columns=cvec_columns)

In [17]:
# This frame has 17 more columns than it should: the expected column count is 1690.
twitter_cvec_df.head()

Unnamed: 0,aba,aba woman,abandoned,abc,abc news,ablaze,able,absolutely,accident,according,...,young,youth,youth saved,youtube,youtube playlist,youtube video,yr,yr old,yyc,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# The TF-IDF frame has 9 more columns than it should: the expected column count is 1097.
# NOTE(review): twitter_tfidf_df is only built in a later cell of this notebook, so
# this cell fails on a fresh top-to-bottom run - confirm the intended cell order.
twitter_tfidf_df.head()

Unnamed: 0,aba,abc,abc news,ablaze,accident,action,actually,added,affected,affected fatal,...,yes,yo,york,young,youth,youtube,youtube video,yr,yr old,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### TFIDF Vectorizing Twitter

In [11]:
# Load the pickled vectorizer used for the TF-IDF features.
# The context manager closes the file handle, which the bare open() call leaked.
# NOTE: pickle.load can execute arbitrary code - only unpickle files from a trusted source.
with open('../Assets/tfidf.pkl', 'rb') as tfidf_file:
    tfidf = pickle.load(tfidf_file)

In [12]:
# Vectorize the tweet text with the loaded TF-IDF vectorizer (transform only, no refit).
twitter_tfidf = tfidf.transform(twitter['text'])

# get_feature_names() was removed from scikit-learn in favor of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_columns = tfidf.get_feature_names_out()
else:
    tfidf_columns = tfidf.get_feature_names()

# Convert the transformed matrix to a dense DataFrame with one column per feature name.
twitter_tfidf_df = pd.DataFrame(twitter_tfidf.toarray(), columns=tfidf_columns)

### Modeling

In [13]:
# Load the pickled classifier used for the predictions below.
# The context manager closes the file handle, which the bare open() call leaked.
# NOTE: pickle.load can execute arbitrary code - only unpickle files from a trusted source.
with open('../Assets/multi_bayes.sav', 'rb') as model_file:
    mnb = pickle.load(model_file)

In [16]:
# (rows, columns) of the count-vectorized frame, shown to check the column count
# before it is passed to the model.
twitter_cvec_df.shape

(1655, 1707)

In [14]:
# Fail fast with an actionable message when the vectorized frame does not have the
# number of features the model was fitted on; otherwise predict() dies inside a
# matrix multiplication with a hard-to-read shape error.
# n_features_in_ is not set on estimators fitted with older scikit-learn versions,
# so the check is skipped when the attribute is absent.
expected_n_features = getattr(mnb, 'n_features_in_', None)
if expected_n_features is not None and twitter_cvec_df.shape[1] != expected_n_features:
    raise ValueError(
        f"twitter_cvec_df has {twitter_cvec_df.shape[1]} columns but the model was "
        f"fitted on {expected_n_features}; load the vectorizer the model was trained with."
    )

# Predict a label for every vectorized tweet.
cvec_preds = mnb.predict(twitter_cvec_df)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 1690 is different from 1707)

In [100]:
# Build a two-column frame pairing each tweet's text with its predicted label.
# Rows with missing text are dropped so the frame has one row per vectorized tweet.
# Taking an explicit copy of the 'text' column up front avoids assigning into a
# column-subset of another frame, which triggers pandas' SettingWithCopyWarning.
twitter_preds = twitter.dropna(subset=['text'])[['text']].copy()

# cvec_preds is attached positionally (wrapped in an array so no index alignment
# happens) and the numeric classes are mapped to their string labels.
twitter_preds['prediction'] = (
    pd.Series(np.asarray(cvec_preds), index=twitter_preds.index)
    .map({0: 'False', 1: 'True'})
)

In [102]:
# Preview the first ten tweets alongside their predicted labels.
twitter_preds.head(10)

Unnamed: 0,text,prediction
0,Imagine being on a cruise ship that’s in the m...,False
1,Este sitio web permite rastrear la propagación...,True
2,-le tengo más miedo al tiro bajo que al corona...,False
3,#CORONAVIRUS https://www.theguardian.com/world...,True
4,#coronavirus #quarantined #ChinaCoronaVirus #C...,False
5,Dont worry guys im sure alex jones has a testo...,False
6,GA Dept Public Health epidemiologist Cherie Dr...,True
7,Is this how Corona virus started?,False
8,http://dlvr.it/RQ0MjR Eunice Yoon@onlyyoontvDi...,False
9,$BARC $JPM Emerging-Market Bonds Have Edge Ove...,False
