In [1]:
import warnings
warnings.filterwarnings("ignore")                     #Ignoring unnecessory warnings

import sys
import numpy as np                                  #for large and multi-dimensional arrays
import pandas as pd                                 #for data manipulation and analysis
import nltk                                         #Natural language processing tool-kit

from sklearn.feature_extraction.text import CountVectorizer          #For Bag of words

In [2]:
data_path = "input/twitter/five_ten.csv"
#data_path = sys.argv[1]
data = pd.read_csv(data_path,encoding='latin-1')
data_threads = data.head(7)

In [3]:
data_threads.columns

Index(['id', 'thread_number', 'timestamp', 'text', 'retweets', 'likes',
       'replies'],
      dtype='object')

In [4]:
final_data = data_threads.drop_duplicates(subset={"thread_number","text","retweets","likes","replies"})

In [5]:
final_thread_number = final_data['thread_number']
final_text = final_data['text']

In [6]:
import re
temp = []
for sentence in final_text:
    sentence = sentence.lower()
    cleanr = re.compile('<.*?>')
    sentence = re.sub(cleanr, ' ', sentence)        #Removing HTML tags
    sentence = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    sentence = re.sub(r'[.|,|)|(|\|/]',r' ',sentence)        #Removing Punctuations
    
    words = [word for word in sentence.split()]   # Stemming and removing stopwords
    temp.append(words)

final_text = temp

In [7]:
print(final_text)

[['extraordinary', 'evidence', 'at', 'treasury', 'committee', 'from', 'jon', 'thompson', 'ceo', 'of', 'hmrc', 'on', 'customs', 'and', 'brexit', 'today', 'https:', 't', 'co', 'djhiqhmvwj'], ['the', 'brexiter', 'favourite', 'max', 'fac', '-', 'would', 'cost', 'business', 'between', 'â£17', 'and', 'â£20bn', 'a', 'year', '-', 'thats', 'almost', '1%', 'of', 'gdp', '-', 'jusâ¦', 'https:', 't', 'co', '0mwicwre4t'], ['how', 'does', 'he', 'arrive', 'at', 'the', 'figure', '200m', 'export', 'consignments', 'at', 'an', 'average', 'cost', 'of', 'â£32', '50', 'each', '=', 'â£6', '5bn', 'times', 'two', 'beâ¦', 'https:', 't', 'co', 'kxnku2qivo'], ['theresa', 'mays', 'new', 'customs', 'partnership', 'is', 'much', 'cheaper', 'for', 'business', 'almost', 'zero', 'cost', 'because', 'it', 'seeks', 'to', 'replicatâ¦', 'https:', 't', 'co', '0lcsjhah0h'], ['mr', 'thompson', 'said', 'he', 'did', 'not', 'expect', 'the', 'eu', 'to', 'reciprocate', 'over', 'the', 'customs', 'partnership', 'what', 'that', 'means',

In [8]:
def combine_words_to_sentence(final_text):    
    temp = []
    for row in final_text:
        sequ = ''
        for word in row:
            sequ = sequ + ' ' + word
        temp.append(sequ)
    return temp

In [9]:
final_text = combine_words_to_sentence(final_text)
print(final_text)

[' extraordinary evidence at treasury committee from jon thompson ceo of hmrc on customs and brexit today https: t co djhiqhmvwj', ' the brexiter favourite max fac - would cost business between â£17 and â£20bn a year - thats almost 1% of gdp - jusâ¦ https: t co 0mwicwre4t', ' how does he arrive at the figure 200m export consignments at an average cost of â£32 50 each = â£6 5bn times two beâ¦ https: t co kxnku2qivo', ' theresa mays new customs partnership is much cheaper for business almost zero cost because it seeks to replicatâ¦ https: t co 0lcsjhah0h', ' mr thompson said he did not expect the eu to reciprocate over the customs partnership what that means is uk collâ¦ https: t co 9c3uhhnzgx', ' both would not be ready by 2021 max fac needs 3 years customs partnership requires 5 mr thompson said the bordâ¦ https: t co lulzgusir4', ' we think we can manage the risk - we think we can he said he didnt sound so sure and the potential backdoorâ¦ https: t co ti1nbbjfpu']


In [10]:
count_vect = CountVectorizer(max_features=5000)
vect_data = count_vect.fit_transform(final_text)
print(vect_data)

  (0, 38)	1
  (0, 30)	1
  (0, 55)	1
  (0, 95)	1
  (0, 23)	1
  (0, 12)	1
  (0, 35)	1
  (0, 72)	1
  (0, 53)	1
  (0, 71)	1
  (0, 28)	1
  (0, 91)	1
  (0, 58)	1
  (0, 50)	1
  (0, 32)	1
  (0, 96)	1
  (0, 14)	1
  (0, 42)	1
  (0, 45)	1
  (1, 1)	1
  (1, 59)	1
  (1, 51)	1
  (1, 10)	1
  (1, 87)	1
  (1, 102)	1
  :	:
  (5, 46)	1
  (5, 63)	1
  (5, 88)	1
  (5, 30)	1
  (5, 55)	1
  (5, 35)	1
  (5, 91)	1
  (6, 92)	1
  (6, 16)	1
  (6, 75)	1
  (6, 85)	1
  (6, 83)	1
  (6, 84)	1
  (6, 37)	1
  (6, 80)	1
  (6, 62)	1
  (6, 27)	2
  (6, 90)	2
  (6, 99)	4
  (6, 81)	1
  (6, 52)	2
  (6, 88)	2
  (6, 30)	1
  (6, 55)	1
  (6, 12)	1


In [11]:
sim_mat = np.zeros([len(final_text), len(final_text)])
from sklearn.metrics.pairwise import cosine_similarity
for i in range(len(final_text)):
  for j in range(len(final_text)):
    if i != j:
       sim_mat[i][j] = cosine_similarity(vect_data[i], vect_data[j])[0,0]

import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(final_text)), reverse=True)

# Extract top 10 sentences as the summary
for i in range(10):
  print(ranked_sentences)

[(0.17665462772736204, ' mr thompson said he did not expect the eu to reciprocate over the customs partnership what that means is uk collâ¦ https: t co 9c3uhhnzgx'), (0.16355344275115066, ' both would not be ready by 2021 max fac needs 3 years customs partnership requires 5 mr thompson said the bordâ¦ https: t co lulzgusir4'), (0.15371271304510203, ' the brexiter favourite max fac - would cost business between â£17 and â£20bn a year - thats almost 1% of gdp - jusâ¦ https: t co 0mwicwre4t'), (0.1309279986183248, ' extraordinary evidence at treasury committee from jon thompson ceo of hmrc on customs and brexit today https: t co djhiqhmvwj'), (0.1302704193830502, ' how does he arrive at the figure 200m export consignments at an average cost of â£32 50 each = â£6 5bn times two beâ¦ https: t co kxnku2qivo'), (0.12754237222691778, ' theresa mays new customs partnership is much cheaper for business almost zero cost because it seeks to replicatâ¦ https: t co 0lcsjhah0h'), (0.11733842624809253,