In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the news dataset from a CSV file, addressed relative to the working directory.
data = pd.read_csv("./dataset/news.csv")

In [3]:
# Preview the first rows of the raw frame (columns: id, headline, text).
data.head()

Unnamed: 0,id,headline,text
0,uid-1,Market Advances 5.12 More,NEW YORK (AP) - A prime rate reduction by New ...
1,uid-2,District Boosts Request For Anti-Terrorism Aid...,Mayor Anthony A. Williams petitioned the White...
2,uid-3,"Election? Here's How You Do It, Mate.",From our downunder perspective here in Austral...
3,uid-4,The Biggest Boom Ever,We are about to rewrite history. Unless a rece...
4,uid-5,Economic Aide Sees Uptrend,"Sedate and scholarly Dr. Arthur Burns, the ex-..."


In [4]:
# (rows, columns) of the raw frame.
data.shape

(3000, 3)

In [5]:
# Frequency of each headline string, most common first.
data["headline"].value_counts()

Business and Finance                                                                                                                                       42
DIGEST                                                                                                                                                     27
Ahead of the Tape                                                                                                                                          24
Key Interest Rates                                                                                                                                          7
A Special Background Report On Trends in Industry And Finance                                                                                               7
World Watch                                                                                                                                                 6
A Special Summary and Forecast Of Federal and State 

In [6]:
# Count missing values per column.
data.isnull().sum()

id          0
headline    0
text        0
dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder

# Encoder applied to the headline column in the next cell.
le = LabelEncoder()

In [8]:
# Replace the headline strings with integer codes. This overwrites the column in place,
# so the original headline text is no longer available in `data` after this cell.
# NOTE(review): with 2873 distinct headlines in 3000 rows the codes are nearly unique per
# row, and they later enter the feature matrix as a plain numeric column next to the
# TF-IDF weights. Integer codes imply an ordering/distance between headlines — confirm
# that this is intended.
data["headline"] = le.fit_transform(data["headline"])
data.head()

Unnamed: 0,id,headline,text
0,uid-1,1529,NEW YORK (AP) - A prime rate reduction by New ...
1,uid-2,678,Mayor Anthony A. Williams petitioned the White...
2,uid-3,849,From our downunder perspective here in Austral...
3,uid-4,2352,We are about to rewrite history. Unless a rece...
4,uid-5,809,"Sedate and scholarly Dr. Arthur Burns, the ex-..."


In [9]:
# Number of distinct headline codes.
data["headline"].nunique()

2873

In [10]:
list = []
for txt in data.text:
    list.append(len(txt))
    

In [11]:
max(list)

4988

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer limited to a vocabulary of at most 5000 terms; all other
# options keep their defaults. The bare name on the last line displays its configuration.
vectorizer = TfidfVectorizer(
    max_features=5000,
)
vectorizer


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [13]:
# Fit the vectorizer on the article text and transform it in one step;
# `text_vec` holds the resulting TF-IDF matrix (converted with .toarray() further down).
text_vec = vectorizer.fit_transform(data.text)

In [14]:
# Vocabulary terms kept by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2 in favor of get_feature_names_out()
# (available from 1.0), so use whichever accessor this installation provides.
_get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
# Materialize as a plain Python list so the display is the same on every version.
[name for name in _get_names()]

['00',
 '000',
 '01',
 '02',
 '03',
 '04',
 '05',
 '06',
 '07',
 '08',
 '09',
 '10',
 '100',
 '101',
 '104',
 '105',
 '107',
 '108',
 '109',
 '11',
 '110',
 '112',
 '113',
 '114',
 '115',
 '116',
 '12',
 '120',
 '122',
 '123',
 '125',
 '13',
 '130',
 '133',
 '14',
 '140',
 '145',
 '15',
 '150',
 '155',
 '156',
 '157',
 '16',
 '160',
 '17',
 '170',
 '175',
 '18',
 '180',
 '19',
 '1929',
 '1930s',
 '1953',
 '1954',
 '1955',
 '1957',
 '1958',
 '1959',
 '1960s',
 '1961',
 '1962',
 '1963',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1980s',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1990s',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '20th',
 '21',

In [15]:
# Densify the TF-IDF matrix into a DataFrame with one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2 in favor of get_feature_names_out()
# (available from 1.0), so fall back to whichever accessor exists.
_get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
text_dataframe = pd.DataFrame(text_vec.toarray(), columns=_get_names())
text_dataframe.head()

Unnamed: 0,00,000,01,02,03,04,05,06,07,08,...,yields,york,you,young,younger,your,youth,yuan,zero,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.108677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.209282,0.0,0.0,0.107125,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Place the original columns and the TF-IDF columns side by side (column-wise concat).
# Both frames carry the default 0..n-1 row index, so rows are matched by position.
features = pd.concat([data, text_dataframe], axis = 1)

In [17]:
# Drop the raw article string; the text is represented by the TF-IDF columns from here on.
# NOTE: this rebinds `features`, so re-running only this cell fails ("text" is already gone).
features = features.drop("text", axis = 1)

In [18]:
# Expect id + headline + 5000 TF-IDF columns.
features.shape

(3000, 5002)

In [19]:
# Move the article id into the row index so it labels rows instead of being a feature column.
# NOTE: this rebinds `features`, so re-running only this cell fails ("id" is no longer a column).
features = features.set_index("id")

In [20]:
# Preview the final feature matrix: headline code followed by the TF-IDF columns, indexed by id.
features.head()

Unnamed: 0_level_0,headline,00,000,01,02,03,04,05,06,07,...,yields,york,you,young,younger,your,youth,yuan,zero,zone
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
uid-1,1529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.108677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
uid-2,678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
uid-3,849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.209282,0.0,0.0,0.107125,0.0,0.0,0.0,0.0
uid-4,2352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
uid-5,809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Dump the feature matrix (headline code + TF-IDF weights) as space-separated text.
# DataFrame has no `to_array()` method (the original call raised AttributeError);
# `.values` returns the underlying NumPy array.
np.savetxt('tf.txt', features.values, delimiter=' ')

In [21]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

In [None]:
# Disabled experiment: the same cluster-count sweep using GaussianMixture (kept commented out).
# for k in [2, 10, 50, 100, 200, 500, 1000]:
    
#     clusterer = GaussianMixture(n_components= k).fit(features)
    
#     preds = clusterer.predict(features)
    
#     print(silhouette_score(features, preds, random_state=0))

In [None]:
from sklearn.cluster import KMeans

# Sweep over candidate cluster counts: fit KMeans for each one and print the silhouette
# score of the resulting assignment (one line per candidate, in the order listed).
# NOTE: the loop variable `k` stays bound to the last candidate once the loop finishes.
for k in (2, 10, 50, 100, 200, 500, 1000):
    clusterer = KMeans(n_clusters=k, random_state=0)
    clusterer.fit(features)
    preds = clusterer.predict(features)
    score = silhouette_score(features, preds, random_state=0)
    print(score)


0.6311212824779112
0.5458259868918389
0.5227699460130845
0.510988980828351
0.48272537283063804
0.3902816979995695


In [None]:
# Final clustering: fit KMeans once with an explicit cluster count and export the labels.
# The count is pinned here instead of being read from the leftover sweep loop variable `k`,
# so the result no longer depends on which cell happened to run last. 1000 is the value `k`
# holds after the sweep completes, i.e. what a top-to-bottom run used before.
# NOTE(review): the recorded sweep scores are highest for 2 clusters (~0.63) — confirm that
# 1000 is really the intended final setting.
n_clusters_final = 1000

# Leave out a "cluster" column written by an earlier run of this cell, so old labels are
# never fed back in as a feature when the cell is executed again.
feature_matrix = features.drop("cluster", axis=1, errors="ignore")

clusterer = KMeans(n_clusters=n_clusters_final, random_state=0).fit(feature_matrix)
preds = clusterer.predict(feature_matrix)
print(silhouette_score(feature_matrix, preds, random_state=0))

# Attach the label of each article (rows are indexed by id) and write id -> cluster to disk.
features["cluster"] = preds

features["cluster"].to_csv("cluster_result.csv")
