In [13]:
import warnings
warnings.filterwarnings("ignore")                     #Ignoring unnecessory warnings

import sys
import numpy as np                                  #for large and multi-dimensional arrays
import pandas as pd                                 #for data manipulation and analysis
import nltk                                         #Natural language processing tool-kit

from sklearn.feature_extraction.text import CountVectorizer          #For Bag of words

In [14]:
data_path = "input/twitter/five_ten.csv"
#data_path = sys.argv[1]
data = pd.read_csv(data_path,encoding='latin-1')
data_threads = data.head(7)

In [15]:
data_threads.columns

Index(['id', 'thread_number', 'timestamp', 'text', 'retweets', 'likes',
       'replies'],
      dtype='object')

In [16]:
final_data = data_threads.drop_duplicates(subset={"thread_number","text","retweets","likes","replies"})

In [17]:
final_thread_number = final_data['thread_number']
final_text = final_data['text']

In [19]:
import re
temp = []
for sentence in final_text:
    sentence = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', sentence)
    sentence = sentence.lower()
    cleanr = re.compile('<.*?>')
    sentence = re.sub(cleanr, ' ', sentence)        #Removing HTML tags
    sentence = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    sentence = re.sub(r'[.|,|)|(|\|/]',r' ',sentence)        #Removing Punctuations
    
    words = [word for word in sentence.split()]   # Stemming and removing stopwords
    temp.append(words)

final_text = temp

In [20]:
print(final_text)

[['extraordinary', 'evidence', 'at', 'treasury', 'committee', 'from', 'jon', 'thompson', 'ceo', 'of', 'hmrc', 'on', 'customs', 'and', 'brexit', 'today'], ['the', 'brexiter', 'favourite', 'max', 'fac', '-', 'would', 'cost', 'business', 'between', 'â£17', 'and', 'â£20bn', 'a', 'year', '-', 'thats', 'almost', '1%', 'of', 'gdp', '-', 'jusâ¦'], ['how', 'does', 'he', 'arrive', 'at', 'the', 'figure', '200m', 'export', 'consignments', 'at', 'an', 'average', 'cost', 'of', 'â£32', '50', 'each', '=', 'â£6', '5bn', 'times', 'two', 'beâ¦'], ['theresa', 'mays', 'new', 'customs', 'partnership', 'is', 'much', 'cheaper', 'for', 'business', 'almost', 'zero', 'cost', 'because', 'it', 'seeks', 'to', 'replicatâ¦'], ['mr', 'thompson', 'said', 'he', 'did', 'not', 'expect', 'the', 'eu', 'to', 'reciprocate', 'over', 'the', 'customs', 'partnership', 'what', 'that', 'means', 'is', 'uk', 'collâ¦'], ['both', 'would', 'not', 'be', 'ready', 'by', '2021', 'max', 'fac', 'needs', '3', 'years', 'customs', 'partnership',

In [21]:
def combine_words_to_sentence(final_text):    
    temp = []
    for row in final_text:
        sequ = ''
        for word in row:
            sequ = sequ + ' ' + word
        temp.append(sequ)
    return temp

In [22]:
final_text = combine_words_to_sentence(final_text)
print(final_text)

[' extraordinary evidence at treasury committee from jon thompson ceo of hmrc on customs and brexit today', ' the brexiter favourite max fac - would cost business between â£17 and â£20bn a year - thats almost 1% of gdp - jusâ¦', ' how does he arrive at the figure 200m export consignments at an average cost of â£32 50 each = â£6 5bn times two beâ¦', ' theresa mays new customs partnership is much cheaper for business almost zero cost because it seeks to replicatâ¦', ' mr thompson said he did not expect the eu to reciprocate over the customs partnership what that means is uk collâ¦', ' both would not be ready by 2021 max fac needs 3 years customs partnership requires 5 mr thompson said the bordâ¦', ' we think we can manage the risk - we think we can he said he didnt sound so sure and the potential backdoorâ¦']


In [23]:
count_vect = CountVectorizer(max_features=5000)
vect_data = count_vect.fit_transform(final_text)
print(vect_data)

  (0, 86)	1
  (0, 20)	1
  (0, 9)	1
  (0, 31)	1
  (0, 64)	1
  (0, 48)	1
  (0, 63)	1
  (0, 25)	1
  (0, 83)	1
  (0, 52)	1
  (0, 45)	1
  (0, 28)	1
  (0, 87)	1
  (0, 11)	1
  (0, 37)	1
  (0, 40)	1
  (1, 53)	1
  (1, 46)	1
  (1, 7)	1
  (1, 79)	1
  (1, 93)	1
  (1, 3)	1
  (1, 0)	1
  (1, 16)	1
  (1, 22)	1
  :	:
  (5, 62)	1
  (5, 73)	1
  (5, 58)	1
  (5, 66)	1
  (5, 92)	1
  (5, 41)	1
  (5, 55)	1
  (5, 80)	1
  (5, 31)	1
  (5, 83)	1
  (6, 13)	1
  (6, 67)	1
  (6, 77)	1
  (6, 75)	1
  (6, 76)	1
  (6, 33)	1
  (6, 72)	1
  (6, 54)	1
  (6, 24)	2
  (6, 82)	2
  (6, 90)	4
  (6, 73)	1
  (6, 47)	2
  (6, 80)	2
  (6, 9)	1


In [32]:
sim_mat = np.zeros([len(final_text), len(final_text)])
from sklearn.metrics.pairwise import cosine_similarity
for i in range(len(final_text)):
  for j in range(len(final_text)):
    if i != j:
       sim_mat[i][j] = cosine_similarity(vect_data[i], vect_data[j])[0,0]

import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(final_text)), reverse=True)

print(ranked_sentences)
for i in ranked_sentences:
    print(i)
    print("\n")
    
# Extract top 10 sentences as the summary
# for i in range(10):
#   print(ranked_sentences)

[(0.20212678443752619, ' mr thompson said he did not expect the eu to reciprocate over the customs partnership what that means is uk collâ¦'), (0.1748943846754864, ' both would not be ready by 2021 max fac needs 3 years customs partnership requires 5 mr thompson said the bordâ¦'), (0.15738839248229636, ' the brexiter favourite max fac - would cost business between â£17 and â£20bn a year - thats almost 1% of gdp - jusâ¦'), (0.12421085030244729, ' how does he arrive at the figure 200m export consignments at an average cost of â£32 50 each = â£6 5bn times two beâ¦'), (0.1158047240347245, ' we think we can manage the risk - we think we can he said he didnt sound so sure and the potential backdoorâ¦'), (0.11414187895075702, ' extraordinary evidence at treasury committee from jon thompson ceo of hmrc on customs and brexit today'), (0.11143298511676256, ' theresa mays new customs partnership is much cheaper for business almost zero cost because it seeks to replicatâ¦')]
(0.20212678443752619, 