In [120]:
import urllib, urllib.request
import requests
import xml.etree.ElementTree as ET
import dateutil.parser
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import plotly.express as px
import random 
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

url = 'http://export.arxiv.org/api/query?search_query=cat:cs.LG&start=0&max_results=10000&sortBy=submittedDate&sortOrder=descending'


data = urllib.request.urlopen(url)
#print(data.read().decode('utf-8'))

In [1]:
plotly.colors.qualitative.Dark24

NameError: name 'plotly' is not defined

In [121]:
class ArxivAPI():
    

response = requests.get(url)
root = ET.fromstring(response.content)
namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}

papers = []
for entry in root.findall('atom:entry', namespaces):
    paper = {}

    # Get title, summary, and date and add them to the paper dictionary
    paper['title'] = entry.find('atom:title', namespaces).text
    paper['summary'] = entry.find('atom:summary', namespaces).text
    paper['date'] = dateutil.parser.parse(entry.find('atom:published', namespaces).text)

    papers.append(paper)

# Convert list of dictionaries to DataFrame

df = pd.DataFrame(papers)


In [156]:
class ProcessingData():
    
# Assuming df is your dataframe and it's already loaded
documents = df['summary']

# Use tf-idf features for NMF.
custom_stop_words = list(ENGLISH_STOP_WORDS)
custom_stop_words.extend(['data','tasks','task','models','node','machine', 'datasets'])

vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
tfidf = vectorizer.fit_transform(documents)

# Fit the NMF model
n_topics = 10  # Or however many topics you want
nmf = NMF(n_components=n_topics, random_state=1, max_iter=2000).fit(tfidf)

# Assign the topic with highest value as the document's topic
df['topic'] = np.argmax(nmf.transform(tfidf), axis=1) + 1

# Extract year from date
df['year'] = df['date'].dt.year

# Count the frequency of each topic in each year
topic_counts = df.groupby(['year', 'topic']).size().reset_index(name='counts')



In [161]:
n_top_words = 10  # Number of top words to display per topic
n_connected_words = 5

# `nmf.components_` is a matrix where each row corresponds to a topic and each column corresponds to a word.
# The value in each cell is the weight of the word for the given topic.
_dict = {}
for topic_idx, topic in enumerate(nmf.components_):
    #print(f"Topic #{topic_idx + 1}")
    
    # Get the indices of the top `n_top_words` words for this topic. 
    # `argsort()` returns the indices that would sort the array.
    top_word_indices = topic.argsort()[::-1][:n_top_words]

    # Map indices to words and print them.
    top_words = [vectorizer.get_feature_names_out()[i] for i in top_word_indices]
    start_index = random.randint(0, len(top_words) - n_connected_words)
    
    connected_words = top_words[start_index : start_index + n_connected_words]

    print("Top words:", top_words)
    _dict[f"{topic_idx + 1}"] = ' '.join(connected_words)

topics_df = pd.DataFrame(_dict,index=[0]).T.reset_index()
topics_df.columns = ['topic','topics']
topics_df.topic = topics_df.topic.astype('int')
df2 = df.merge(topics_df, on='topic', how='left')

import plotly.express as px

# Convert 'date' to datetime, if it isn't already
df2['date'] = pd.to_datetime(df2['date'])

# Group by 'date' and 'topics', count the number of records in each group
df_grouped = df2.groupby([df2['date'].dt.date, 'topics']).size().reset_index(name='count')

# Create a wide format dataframe suitable for stacked bar chart
df_wide = df_grouped.pivot(index='date', columns='topics', values='count').reset_index().fillna(0)

# Plot stacked bar chart
fig = px.bar(df_wide, x='date', y=df_wide.columns[1:],
             labels={'value':'Frequency', 'date':'Date', 'variable':'Topic'},
             title='Topic Distribution Over Time for the Last 10,000 Machine Learning Papers on ArXiv')

fig.update_layout(barmode='stack')
fig.show()

Top words: ['learning', 'model', 'methods', 'domain', 'features', 'dataset', 'performance', 'feature', 'supervised', 'classification']
Top words: ['algorithm', 'optimization', 'problem', 'algorithms', 'gradient', 'optimal', 'function', 'stochastic', 'linear', 'problems']
Top words: ['graph', 'graphs', 'gnns', 'gnn', 'nodes', 'structure', 'information', 'networks', 'representation', 'temporal']
Top words: ['fl', 'privacy', 'federated', 'clients', 'client', 'local', 'communication', 'learning', 'model', 'training']
Top words: ['rl', 'policy', 'agent', 'reinforcement', 'offline', 'learning', 'agents', 'reward', 'policies', 'environment']
Top words: ['time', 'series', 'forecasting', 'temporal', 'multivariate', 'long', 'transformer', 'term', 'based', 'real']
Top words: ['language', 'llms', 'text', 'large', 'pre', 'tuning', 'fine', 'model', 'code', 'llm']
Top words: ['neural', 'networks', 'network', 'deep', 'training', 'layer', 'architecture', 'quantum', 'layers', 'model']
Top words: ['adver

In [3]:
!pip3 install einops

Collecting einops
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
     -------------------------------------- 42.2/42.2 kB 410.2 kB/s eta 0:00:00
Installing collected packages: einops
Successfully installed einops-0.6.1


In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("stabilityai/FreeWilly2", use_fast=False)
model = AutoModelForCausalLM.from_pretrained("stabilityai/FreeWilly2", torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto")
system_prompt = "### System:\nYou are Free Willy, an AI that follows instructions extremely well. Help as much as you can. Remember, be safe, and don't do anything illegal.\n\n"

message = "Write me a poem please"
prompt = f"{system_prompt}### User: {message}\n\n### Assistant:\n"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
output = model.generate(**inputs, do_sample=True, top_p=0.95, top_k=0, max_new_tokens=256)

print(tokenizer.decode(output[0], skip_special_tokens=True))


Downloading (…)okenizer_config.json: 100%|██████████| 649/649 [00:00<?, ?B/s] 


ImportError: 
LlamaTokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


